summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorKonstantin Komarov <almaz.alexandrovich@paragon-software.com>2022-05-31 16:13:23 +0300
committerKonstantin Komarov <almaz.alexandrovich@paragon-software.com>2022-05-31 16:13:23 +0300
commit03ab8e6297acd1bc0eedaa050e2a1635c576fd11 (patch)
tree519b79a60508ae3992f0f6bf10deac237dbc45ae /fs
parent52e00ea6b26e45fb8159e3b57cdde8d3f9bdd8e9 (diff)
parent4b0986a3613c92f4ec1bdc7f60ec66fea135991f (diff)
Merge tag 'v5.18'
Linux 5.18
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig1
-rw-r--r--fs/9p/acl.c11
-rw-r--r--fs/9p/acl.h27
-rw-r--r--fs/9p/cache.c338
-rw-r--r--fs/9p/cache.h118
-rw-r--r--fs/9p/fid.c12
-rw-r--r--fs/9p/v9fs.c43
-rw-r--r--fs/9p/v9fs.h41
-rw-r--r--fs/9p/v9fs_vfs.h11
-rw-r--r--fs/9p/vfs_addr.c317
-rw-r--r--fs/9p/vfs_dentry.c4
-rw-r--r--fs/9p/vfs_dir.c19
-rw-r--r--fs/9p/vfs_file.c41
-rw-r--r--fs/9p/vfs_inode.c70
-rw-r--r--fs/9p/vfs_inode_dotl.c43
-rw-r--r--fs/9p/vfs_super.c17
-rw-r--r--fs/9p/xattr.c10
-rw-r--r--fs/9p/xattr.h29
-rw-r--r--fs/Kconfig16
-rw-r--r--fs/Kconfig.binfmt13
-rw-r--r--fs/Makefile4
-rw-r--r--fs/adfs/inode.c7
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/file.c6
-rw-r--r--fs/affs/super.c4
-rw-r--r--fs/afs/Makefile3
-rw-r--r--fs/afs/cache.c68
-rw-r--r--fs/afs/cell.c12
-rw-r--r--fs/afs/dir.c241
-rw-r--r--fs/afs/dir_edit.c154
-rw-r--r--fs/afs/dynroot.c1
-rw-r--r--fs/afs/file.c172
-rw-r--r--fs/afs/inode.c137
-rw-r--r--fs/afs/internal.h113
-rw-r--r--fs/afs/main.c14
-rw-r--r--fs/afs/proc.c6
-rw-r--r--fs/afs/super.c8
-rw-r--r--fs/afs/volume.c29
-rw-r--r--fs/afs/write.c456
-rw-r--r--fs/afs/yfsclient.c32
-rw-r--r--fs/aio.c230
-rw-r--r--fs/anon_inodes.c29
-rw-r--r--fs/attr.c4
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/file.c3
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c199
-rw-r--r--fs/binfmt_elf_fdpic.c22
-rw-r--r--fs/binfmt_elf_test.c64
-rw-r--r--fs/binfmt_flat.c7
-rw-r--r--fs/btrfs/Kconfig3
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/async-thread.c14
-rw-r--r--fs/btrfs/backref.c84
-rw-r--r--fs/btrfs/block-group.c380
-rw-r--r--fs/btrfs/block-group.h13
-rw-r--r--fs/btrfs/block-rsv.c84
-rw-r--r--fs/btrfs/block-rsv.h5
-rw-r--r--fs/btrfs/btrfs_inode.h85
-rw-r--r--fs/btrfs/check-integrity.c206
-rw-r--r--fs/btrfs/compression.c742
-rw-r--r--fs/btrfs/compression.h14
-rw-r--r--fs/btrfs/ctree.c820
-rw-r--r--fs/btrfs/ctree.h352
-rw-r--r--fs/btrfs/delalloc-space.c32
-rw-r--r--fs/btrfs/delayed-inode.c44
-rw-r--r--fs/btrfs/delayed-ref.c42
-rw-r--r--fs/btrfs/delayed-ref.h51
-rw-r--r--fs/btrfs/dev-replace.c55
-rw-r--r--fs/btrfs/dir-item.c12
-rw-r--r--fs/btrfs/disk-io.c740
-rw-r--r--fs/btrfs/disk-io.h18
-rw-r--r--fs/btrfs/extent-io-tree.h4
-rw-r--r--fs/btrfs/extent-tree.c592
-rw-r--r--fs/btrfs/extent_io.c535
-rw-r--r--fs/btrfs/extent_io.h12
-rw-r--r--fs/btrfs/extent_map.c10
-rw-r--r--fs/btrfs/extent_map.h8
-rw-r--r--fs/btrfs/file-item.c128
-rw-r--r--fs/btrfs/file.c366
-rw-r--r--fs/btrfs/free-space-cache.c346
-rw-r--r--fs/btrfs/free-space-cache.h10
-rw-r--r--fs/btrfs/free-space-tree.c56
-rw-r--r--fs/btrfs/inode-item.c344
-rw-r--r--fs/btrfs/inode-item.h96
-rw-r--r--fs/btrfs/inode.c2575
-rw-r--r--fs/btrfs/ioctl.c1673
-rw-r--r--fs/btrfs/locking.h7
-rw-r--r--fs/btrfs/lzo.c320
-rw-r--r--fs/btrfs/ordered-data.c132
-rw-r--r--fs/btrfs/ordered-data.h25
-rw-r--r--fs/btrfs/print-tree.c13
-rw-r--r--fs/btrfs/props.c66
-rw-r--r--fs/btrfs/props.h4
-rw-r--r--fs/btrfs/qgroup.c127
-rw-r--r--fs/btrfs/raid56.c175
-rw-r--r--fs/btrfs/raid56.h22
-rw-r--r--fs/btrfs/reada.c1086
-rw-r--r--fs/btrfs/ref-verify.c12
-rw-r--r--fs/btrfs/reflink.c53
-rw-r--r--fs/btrfs/relocation.c146
-rw-r--r--fs/btrfs/root-tree.c30
-rw-r--r--fs/btrfs/scrub.c395
-rw-r--r--fs/btrfs/send.c470
-rw-r--r--fs/btrfs/send.h9
-rw-r--r--fs/btrfs/space-info.c124
-rw-r--r--fs/btrfs/space-info.h2
-rw-r--r--fs/btrfs/subpage.c290
-rw-r--r--fs/btrfs/subpage.h56
-rw-r--r--fs/btrfs/super.c127
-rw-r--r--fs/btrfs/sysfs.c122
-rw-r--r--fs/btrfs/tests/btrfs-tests.c1
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c19
-rw-r--r--fs/btrfs/tests/extent-io-tests.c64
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/tests/free-space-tests.c186
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c5
-rw-r--r--fs/btrfs/tests/inode-tests.c4
-rw-r--r--fs/btrfs/tests/qgroup-tests.c5
-rw-r--r--fs/btrfs/transaction.c293
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-checker.c116
-rw-r--r--fs/btrfs/tree-defrag.c8
-rw-r--r--fs/btrfs/tree-log.c2183
-rw-r--r--fs/btrfs/tree-log.h25
-rw-r--r--fs/btrfs/uuid-tree.c10
-rw-r--r--fs/btrfs/verity.c2
-rw-r--r--fs/btrfs/volumes.c959
-rw-r--r--fs/btrfs/volumes.h138
-rw-r--r--fs/btrfs/xattr.c21
-rw-r--r--fs/btrfs/zoned.c709
-rw-r--r--fs/btrfs/zoned.h63
-rw-r--r--fs/btrfs/zstd.c68
-rw-r--r--fs/buffer.c161
-rw-r--r--fs/cachefiles/Kconfig7
-rw-r--r--fs/cachefiles/Makefile6
-rw-r--r--fs/cachefiles/bind.c278
-rw-r--r--fs/cachefiles/cache.c383
-rw-r--r--fs/cachefiles/daemon.c189
-rw-r--r--fs/cachefiles/error_inject.c46
-rw-r--r--fs/cachefiles/interface.c747
-rw-r--r--fs/cachefiles/internal.h272
-rw-r--r--fs/cachefiles/io.c405
-rw-r--r--fs/cachefiles/key.c201
-rw-r--r--fs/cachefiles/main.c22
-rw-r--r--fs/cachefiles/namei.c1236
-rw-r--r--fs/cachefiles/rdwr.c972
-rw-r--r--fs/cachefiles/security.c2
-rw-r--r--fs/cachefiles/volume.c139
-rw-r--r--fs/cachefiles/xattr.c436
-rw-r--r--fs/ceph/addr.c588
-rw-r--r--fs/ceph/cache.c237
-rw-r--r--fs/ceph/cache.h103
-rw-r--r--fs/ceph/caps.c251
-rw-r--r--fs/ceph/debugfs.c166
-rw-r--r--fs/ceph/dir.c17
-rw-r--r--fs/ceph/export.c12
-rw-r--r--fs/ceph/file.c264
-rw-r--r--fs/ceph/inode.c149
-rw-r--r--fs/ceph/locks.c17
-rw-r--r--fs/ceph/mds_client.c217
-rw-r--r--fs/ceph/mds_client.h15
-rw-r--r--fs/ceph/mdsmap.c4
-rw-r--r--fs/ceph/metric.c191
-rw-r--r--fs/ceph/metric.h151
-rw-r--r--fs/ceph/quota.c17
-rw-r--r--fs/ceph/snap.c263
-rw-r--r--fs/ceph/strings.c1
-rw-r--r--fs/ceph/super.c204
-rw-r--r--fs/ceph/super.h76
-rw-r--r--fs/ceph/xattr.c16
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/cache.c105
-rw-r--r--fs/cifs/cifs_debug.c17
-rw-r--r--fs/cifs/cifs_dfs_ref.c59
-rw-r--r--fs/cifs/cifs_fs_sb.h5
-rw-r--r--fs/cifs/cifs_spnego.c4
-rw-r--r--fs/cifs/cifs_spnego.h3
-rw-r--r--fs/cifs/cifs_swn.c21
-rw-r--r--fs/cifs/cifsacl.c9
-rw-r--r--fs/cifs/cifsencrypt.c6
-rw-r--r--fs/cifs/cifsfs.c66
-rw-r--r--fs/cifs/cifsfs.h3
-rw-r--r--fs/cifs/cifsglob.h162
-rw-r--r--fs/cifs/cifspdu.h16
-rw-r--r--fs/cifs/cifsproto.h53
-rw-r--r--fs/cifs/cifssmb.c97
-rw-r--r--fs/cifs/connect.c1822
-rw-r--r--fs/cifs/dfs_cache.c58
-rw-r--r--fs/cifs/dir.c5
-rw-r--r--fs/cifs/file.c369
-rw-r--r--fs/cifs/fs_context.c102
-rw-r--r--fs/cifs/fs_context.h3
-rw-r--r--fs/cifs/fscache.c426
-rw-r--r--fs/cifs/fscache.h195
-rw-r--r--fs/cifs/inode.c31
-rw-r--r--fs/cifs/link.c3
-rw-r--r--fs/cifs/misc.c113
-rw-r--r--fs/cifs/netmisc.c5
-rw-r--r--fs/cifs/ntlmssp.h36
-rw-r--r--fs/cifs/sess.c534
-rw-r--r--fs/cifs/smb1ops.c22
-rw-r--r--fs/cifs/smb2glob.h13
-rw-r--r--fs/cifs/smb2inode.c22
-rw-r--r--fs/cifs/smb2maperror.c16
-rw-r--r--fs/cifs/smb2misc.c66
-rw-r--r--fs/cifs/smb2ops.c382
-rw-r--r--fs/cifs/smb2pdu.c471
-rw-r--r--fs/cifs/smb2pdu.h1447
-rw-r--r--fs/cifs/smb2proto.h10
-rw-r--r--fs/cifs/smb2transport.c103
-rw-r--r--fs/cifs/trace.h78
-rw-r--r--fs/cifs/transport.c92
-rw-r--r--fs/cifs/xattr.c2
-rw-r--r--fs/coda/cnode.c13
-rw-r--r--fs/coda/coda_linux.c39
-rw-r--r--fs/coda/coda_linux.h6
-rw-r--r--fs/coda/dir.c20
-rw-r--r--fs/coda/file.c13
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/coda/psdev.c14
-rw-r--r--fs/coda/upcall.c3
-rw-r--r--fs/compat_binfmt_elf.c2
-rw-r--r--fs/configfs/dir.c20
-rw-r--r--fs/coredump.c253
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/crypto/bio.c45
-rw-r--r--fs/crypto/crypto.c10
-rw-r--r--fs/crypto/fname.c3
-rw-r--r--fs/crypto/fscrypt_private.h16
-rw-r--r--fs/crypto/hkdf.c11
-rw-r--r--fs/crypto/inline_crypt.c93
-rw-r--r--fs/crypto/keysetup.c62
-rw-r--r--fs/d_path.c8
-rw-r--r--fs/dax.c160
-rw-r--r--fs/dcache.c40
-rw-r--r--fs/debugfs/file.c2
-rw-r--r--fs/debugfs/inode.c10
-rw-r--r--fs/devpts/inode.c2
-rw-r--r--fs/direct-io.c24
-rw-r--r--fs/dlm/ast.c16
-rw-r--r--fs/dlm/debug_fs.c96
-rw-r--r--fs/dlm/dir.c3
-rw-r--r--fs/dlm/dlm_internal.h12
-rw-r--r--fs/dlm/lock.c109
-rw-r--r--fs/dlm/lock.h4
-rw-r--r--fs/dlm/lockspace.c41
-rw-r--r--fs/dlm/lowcomms.c209
-rw-r--r--fs/dlm/lowcomms.h6
-rw-r--r--fs/dlm/main.c3
-rw-r--r--fs/dlm/member.c3
-rw-r--r--fs/dlm/memory.c68
-rw-r--r--fs/dlm/memory.h6
-rw-r--r--fs/dlm/midcomms.c85
-rw-r--r--fs/dlm/midcomms.h3
-rw-r--r--fs/dlm/rcom.c2
-rw-r--r--fs/dlm/recoverd.c3
-rw-r--r--fs/dlm/requestqueue.c17
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/ecryptfs/mmap.c5
-rw-r--r--fs/ecryptfs/super.c2
-rw-r--r--fs/efs/super.c2
-rw-r--r--fs/erofs/Kconfig40
-rw-r--r--fs/erofs/Makefile3
-rw-r--r--fs/erofs/compress.h32
-rw-r--r--fs/erofs/data.c224
-rw-r--r--fs/erofs/decompressor.c251
-rw-r--r--fs/erofs/decompressor_lzma.c287
-rw-r--r--fs/erofs/dir.c21
-rw-r--r--fs/erofs/erofs_fs.h94
-rw-r--r--fs/erofs/inode.c74
-rw-r--r--fs/erofs/internal.h160
-rw-r--r--fs/erofs/namei.c54
-rw-r--r--fs/erofs/pcpubuf.c6
-rw-r--r--fs/erofs/super.c355
-rw-r--r--fs/erofs/sysfs.c258
-rw-r--r--fs/erofs/utils.c27
-rw-r--r--fs/erofs/xattr.c139
-rw-r--r--fs/erofs/xattr.h1
-rw-r--r--fs/erofs/zdata.c624
-rw-r--r--fs/erofs/zdata.h28
-rw-r--r--fs/erofs/zmap.c296
-rw-r--r--fs/erofs/zpvec.h13
-rw-r--r--fs/eventpoll.c10
-rw-r--r--fs/exec.c107
-rw-r--r--fs/exfat/balloc.c2
-rw-r--r--fs/exfat/dir.c42
-rw-r--r--fs/exfat/exfat_fs.h9
-rw-r--r--fs/exfat/fatent.c4
-rw-r--r--fs/exfat/file.c20
-rw-r--r--fs/exfat/inode.c18
-rw-r--r--fs/exfat/misc.c3
-rw-r--r--fs/exfat/namei.c103
-rw-r--r--fs/exfat/nls.c2
-rw-r--r--fs/exfat/super.c30
-rw-r--r--fs/ext2/ext2.h1
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/inode.c24
-rw-r--r--fs/ext2/super.c24
-rw-r--r--fs/ext4/acl.c10
-rw-r--r--fs/ext4/balloc.c1
-rw-r--r--fs/ext4/block_validity.c26
-rw-r--r--fs/ext4/dir.c1
-rw-r--r--fs/ext4/ext4.h71
-rw-r--r--fs/ext4/ext4_jbd2.c2
-rw-r--r--fs/ext4/ext4_jbd2.h2
-rw-r--r--fs/ext4/extents.c244
-rw-r--r--fs/ext4/fast_commit.c484
-rw-r--r--fs/ext4/fast_commit.h33
-rw-r--r--fs/ext4/file.c23
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inline.c37
-rw-r--r--fs/ext4/inode.c607
-rw-r--r--fs/ext4/ioctl.c348
-rw-r--r--fs/ext4/mballoc.c442
-rw-r--r--fs/ext4/migrate.c25
-rw-r--r--fs/ext4/move_extent.c1
-rw-r--r--fs/ext4/namei.c47
-rw-r--r--fs/ext4/orphan.c4
-rw-r--r--fs/ext4/page-io.c39
-rw-r--r--fs/ext4/readpage.c16
-rw-r--r--fs/ext4/resize.c26
-rw-r--r--fs/ext4/super.c2113
-rw-r--r--fs/ext4/sysfs.c44
-rw-r--r--fs/ext4/xattr.c6
-rw-r--r--fs/f2fs/Kconfig8
-rw-r--r--fs/f2fs/acl.c21
-rw-r--r--fs/f2fs/checkpoint.c101
-rw-r--r--fs/f2fs/compress.c178
-rw-r--r--fs/f2fs/data.c706
-rw-r--r--fs/f2fs/debug.c25
-rw-r--r--fs/f2fs/dir.c22
-rw-r--r--fs/f2fs/f2fs.h283
-rw-r--r--fs/f2fs/file.c662
-rw-r--r--fs/f2fs/gc.c87
-rw-r--r--fs/f2fs/hash.c2
-rw-r--r--fs/f2fs/inline.c10
-rw-r--r--fs/f2fs/inode.c34
-rw-r--r--fs/f2fs/iostat.c40
-rw-r--r--fs/f2fs/namei.c114
-rw-r--r--fs/f2fs/node.c143
-rw-r--r--fs/f2fs/node.h8
-rw-r--r--fs/f2fs/recovery.c61
-rw-r--r--fs/f2fs/segment.c276
-rw-r--r--fs/f2fs/segment.h9
-rw-r--r--fs/f2fs/super.c276
-rw-r--r--fs/f2fs/sysfs.c101
-rw-r--r--fs/f2fs/verity.c6
-rw-r--r--fs/f2fs/xattr.c54
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/file.c5
-rw-r--r--fs/fat/inode.c16
-rw-r--r--fs/fcntl.c18
-rw-r--r--fs/file.c99
-rw-r--r--fs/file_table.c54
-rw-r--r--fs/freevxfs/vxfs_super.c2
-rw-r--r--fs/fs-writeback.c87
-rw-r--r--fs/fs_context.c2
-rw-r--r--fs/fs_parser.c31
-rw-r--r--fs/fscache/Makefile6
-rw-r--r--fs/fscache/cache.c618
-rw-r--r--fs/fscache/cookie.c1450
-rw-r--r--fs/fscache/fsdef.c98
-rw-r--r--fs/fscache/internal.h314
-rw-r--r--fs/fscache/io.c377
-rw-r--r--fs/fscache/main.c147
-rw-r--r--fs/fscache/netfs.c74
-rw-r--r--fs/fscache/object.c1125
-rw-r--r--fs/fscache/operation.c633
-rw-r--r--fs/fscache/page.c1242
-rw-r--r--fs/fscache/proc.c47
-rw-r--r--fs/fscache/stats.c293
-rw-r--r--fs/fscache/volume.c517
-rw-r--r--fs/fuse/Kconfig2
-rw-r--r--fs/fuse/control.c17
-rw-r--r--fs/fuse/dax.c44
-rw-r--r--fs/fuse/dev.c44
-rw-r--r--fs/fuse/dir.c221
-rw-r--r--fs/fuse/file.c150
-rw-r--r--fs/fuse/fuse_i.h51
-rw-r--r--fs/fuse/inode.c139
-rw-r--r--fs/fuse/ioctl.c15
-rw-r--r--fs/fuse/readdir.c6
-rw-r--r--fs/fuse/virtio_fs.c43
-rw-r--r--fs/fuse/xattr.c10
-rw-r--r--fs/gfs2/aops.c43
-rw-r--r--fs/gfs2/bmap.c82
-rw-r--r--fs/gfs2/file.c290
-rw-r--r--fs/gfs2/glock.c482
-rw-r--r--fs/gfs2/glock.h34
-rw-r--r--fs/gfs2/glops.c29
-rw-r--r--fs/gfs2/incore.h10
-rw-r--r--fs/gfs2/inode.c140
-rw-r--r--fs/gfs2/lock_dlm.c15
-rw-r--r--fs/gfs2/lops.c9
-rw-r--r--fs/gfs2/meta_io.c10
-rw-r--r--fs/gfs2/ops_fstype.c4
-rw-r--r--fs/gfs2/rgrp.c89
-rw-r--r--fs/gfs2/rgrp.h4
-rw-r--r--fs/gfs2/super.c26
-rw-r--r--fs/gfs2/sys.c5
-rw-r--r--fs/gfs2/trace_gfs2.h9
-rw-r--r--fs/gfs2/util.c2
-rw-r--r--fs/hfs/inode.c12
-rw-r--r--fs/hfs/mdb.c4
-rw-r--r--fs/hfs/super.c2
-rw-r--r--fs/hfsplus/hfsplus_raw.h12
-rw-r--r--fs/hfsplus/inode.c18
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hfsplus/wrapper.c7
-rw-r--r--fs/hfsplus/xattr.c4
-rw-r--r--fs/hostfs/hostfs_kern.c8
-rw-r--r--fs/hpfs/file.c3
-rw-r--r--fs/hpfs/hpfs.h8
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hugetlbfs/inode.c43
-rw-r--r--fs/inode.c145
-rw-r--r--fs/internal.h25
-rw-r--r--fs/io-wq.c326
-rw-r--r--fs/io-wq.h83
-rw-r--r--fs/io_uring.c4342
-rw-r--r--fs/ioctl.c8
-rw-r--r--fs/iomap/Makefile4
-rw-r--r--fs/iomap/buffered-io.c674
-rw-r--r--fs/iomap/direct-io.c104
-rw-r--r--fs/iomap/fiemap.c1
-rw-r--r--fs/iomap/trace.h2
-rw-r--r--fs/isofs/inode.c4
-rw-r--r--fs/jbd2/commit.c25
-rw-r--r--fs/jbd2/journal.c14
-rw-r--r--fs/jbd2/transaction.c126
-rw-r--r--fs/jffs2/background.c2
-rw-r--r--fs/jffs2/build.c4
-rw-r--r--fs/jffs2/file.c40
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/jffs2_fs_i.h4
-rw-r--r--fs/jffs2/scan.c6
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jfs/inode.c6
-rw-r--r--fs/jfs/jfs_dmap.c7
-rw-r--r--fs/jfs/jfs_logmgr.c11
-rw-r--r--fs/jfs/jfs_metapage.c24
-rw-r--r--fs/jfs/jfs_mount.c51
-rw-r--r--fs/jfs/resize.c5
-rw-r--r--fs/jfs/super.c7
-rw-r--r--fs/kernfs/dir.c132
-rw-r--r--fs/kernfs/file.c15
-rw-r--r--fs/kernfs/inode.c22
-rw-r--r--fs/kernfs/kernfs-internal.h19
-rw-r--r--fs/kernfs/mount.c15
-rw-r--r--fs/kernfs/symlink.c8
-rw-r--r--fs/ksmbd/Kconfig2
-rw-r--r--fs/ksmbd/asn1.c142
-rw-r--r--fs/ksmbd/auth.c65
-rw-r--r--fs/ksmbd/auth.h10
-rw-r--r--fs/ksmbd/connection.c21
-rw-r--r--fs/ksmbd/connection.h12
-rw-r--r--fs/ksmbd/ksmbd_netlink.h14
-rw-r--r--fs/ksmbd/ksmbd_work.c1
-rw-r--r--fs/ksmbd/ksmbd_work.h4
-rw-r--r--fs/ksmbd/mgmt/user_config.c10
-rw-r--r--fs/ksmbd/mgmt/user_config.h1
-rw-r--r--fs/ksmbd/mgmt/user_session.h1
-rw-r--r--fs/ksmbd/misc.c40
-rw-r--r--fs/ksmbd/misc.h3
-rw-r--r--fs/ksmbd/ndr.c2
-rw-r--r--fs/ksmbd/ntlmssp.h6
-rw-r--r--fs/ksmbd/oplock.c82
-rw-r--r--fs/ksmbd/oplock.h4
-rw-r--r--fs/ksmbd/server.c4
-rw-r--r--fs/ksmbd/smb2misc.c34
-rw-r--r--fs/ksmbd/smb2ops.c28
-rw-r--r--fs/ksmbd/smb2pdu.c1060
-rw-r--r--fs/ksmbd/smb2pdu.h1224
-rw-r--r--fs/ksmbd/smb_common.c18
-rw-r--r--fs/ksmbd/smb_common.h56
-rw-r--r--fs/ksmbd/smbacl.c19
-rw-r--r--fs/ksmbd/smbacl.h5
-rw-r--r--fs/ksmbd/transport_ipc.c2
-rw-r--r--fs/ksmbd/transport_rdma.c266
-rw-r--r--fs/ksmbd/transport_rdma.h4
-rw-r--r--fs/ksmbd/transport_tcp.c7
-rw-r--r--fs/ksmbd/vfs.c15
-rw-r--r--fs/ksmbd/vfs.h40
-rw-r--r--fs/ksmbd/vfs_cache.c2
-rw-r--r--fs/ksmbd/vfs_cache.h11
-rw-r--r--fs/ksmbd/xattr.h2
-rw-r--r--fs/libfs.c54
-rw-r--r--fs/lockd/clntproc.c3
-rw-r--r--fs/lockd/svc.c218
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svclock.c6
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/lockd/svcsubs.c18
-rw-r--r--fs/lockd/xdr.c152
-rw-r--r--fs/lockd/xdr4.c153
-rw-r--r--fs/locks.c195
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/mpage.c96
-rw-r--r--fs/namei.c97
-rw-r--r--fs/namespace.c321
-rw-r--r--fs/netfs/Makefile8
-rw-r--r--fs/netfs/buffered_read.c428
-rw-r--r--fs/netfs/internal.h50
-rw-r--r--fs/netfs/io.c657
-rw-r--r--fs/netfs/main.c20
-rw-r--r--fs/netfs/objects.c160
-rw-r--r--fs/netfs/read_helper.c1208
-rw-r--r--fs/netfs/stats.c1
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c26
-rw-r--r--fs/nfs/blocklayout/dev.c4
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c1
-rw-r--r--fs/nfs/callback.c96
-rw-r--r--fs/nfs/callback.h2
-rw-r--r--fs/nfs/callback_proc.c34
-rw-r--r--fs/nfs/callback_xdr.c26
-rw-r--r--fs/nfs/client.c53
-rw-r--r--fs/nfs/delegation.c12
-rw-r--r--fs/nfs/dir.c891
-rw-r--r--fs/nfs/direct.c54
-rw-r--r--fs/nfs/export.c46
-rw-r--r--fs/nfs/file.c78
-rw-r--r--fs/nfs/filelayout/filelayout.c4
-rw-r--r--fs/nfs/filelayout/filelayout.h2
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c4
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c55
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c4
-rw-r--r--fs/nfs/fs_context.c11
-rw-r--r--fs/nfs/fscache-index.c140
-rw-r--r--fs/nfs/fscache.c507
-rw-r--r--fs/nfs/fscache.h187
-rw-r--r--fs/nfs/getroot.c21
-rw-r--r--fs/nfs/inode.c238
-rw-r--r--fs/nfs/internal.h48
-rw-r--r--fs/nfs/namespace.c3
-rw-r--r--fs/nfs/nfs2xdr.c3
-rw-r--r--fs/nfs/nfs3proc.c15
-rw-r--r--fs/nfs/nfs3xdr.c32
-rw-r--r--fs/nfs/nfs42proc.c60
-rw-r--r--fs/nfs/nfs42xattr.c9
-rw-r--r--fs/nfs/nfs42xdr.c3
-rw-r--r--fs/nfs/nfs4_fs.h19
-rw-r--r--fs/nfs/nfs4client.c70
-rw-r--r--fs/nfs/nfs4file.c28
-rw-r--r--fs/nfs/nfs4idmap.c2
-rw-r--r--fs/nfs/nfs4namespace.c19
-rw-r--r--fs/nfs/nfs4proc.c561
-rw-r--r--fs/nfs/nfs4session.c12
-rw-r--r--fs/nfs/nfs4session.h1
-rw-r--r--fs/nfs/nfs4state.c75
-rw-r--r--fs/nfs/nfs4trace.h920
-rw-r--r--fs/nfs/nfs4xdr.c137
-rw-r--r--fs/nfs/nfstrace.h684
-rw-r--r--fs/nfs/pagelist.c24
-rw-r--r--fs/nfs/pnfs.c50
-rw-r--r--fs/nfs/pnfs.h8
-rw-r--r--fs/nfs/pnfs_nfs.c14
-rw-r--r--fs/nfs/proc.c17
-rw-r--r--fs/nfs/read.c59
-rw-r--r--fs/nfs/super.c35
-rw-r--r--fs/nfs/sysfs.c3
-rw-r--r--fs/nfs/unlink.c1
-rw-r--r--fs/nfs/write.c146
-rw-r--r--fs/nfsd/Kconfig13
-rw-r--r--fs/nfsd/Makefile3
-rw-r--r--fs/nfsd/blocklayout.c159
-rw-r--r--fs/nfsd/export.c2
-rw-r--r--fs/nfsd/filecache.c114
-rw-r--r--fs/nfsd/filecache.h1
-rw-r--r--fs/nfsd/flexfilelayout.c4
-rw-r--r--fs/nfsd/lockd.c2
-rw-r--r--fs/nfsd/netns.h27
-rw-r--r--fs/nfsd/nfs2acl.c68
-rw-r--r--fs/nfsd/nfs3acl.c48
-rw-r--r--fs/nfsd/nfs3proc.c39
-rw-r--r--fs/nfsd/nfs3xdr.c456
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/nfsd/nfs4layouts.c7
-rw-r--r--fs/nfsd/nfs4proc.c48
-rw-r--r--fs/nfsd/nfs4recover.c1
-rw-r--r--fs/nfsd/nfs4state.c102
-rw-r--r--fs/nfsd/nfs4xdr.c96
-rw-r--r--fs/nfsd/nfscache.c50
-rw-r--r--fs/nfsd/nfsctl.c62
-rw-r--r--fs/nfsd/nfsd.h10
-rw-r--r--fs/nfsd/nfsfh.c235
-rw-r--r--fs/nfsd/nfsfh.h81
-rw-r--r--fs/nfsd/nfsproc.c21
-rw-r--r--fs/nfsd/nfssvc.c269
-rw-r--r--fs/nfsd/nfsxdr.c187
-rw-r--r--fs/nfsd/state.h5
-rw-r--r--fs/nfsd/stats.c2
-rw-r--r--fs/nfsd/stats.h4
-rw-r--r--fs/nfsd/trace.h228
-rw-r--r--fs/nfsd/vfs.c194
-rw-r--r--fs/nfsd/vfs.h9
-rw-r--r--fs/nfsd/xdr.h39
-rw-r--r--fs/nfsd/xdr3.h63
-rw-r--r--fs/nfsd/xdr4.h7
-rw-r--r--fs/nilfs2/alloc.c2
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/bmap.c2
-rw-r--r--fs/nilfs2/bmap.h2
-rw-r--r--fs/nilfs2/btnode.c25
-rw-r--r--fs/nilfs2/btnode.h3
-rw-r--r--fs/nilfs2/btree.c29
-rw-r--r--fs/nilfs2/btree.h2
-rw-r--r--fs/nilfs2/cpfile.c2
-rw-r--r--fs/nilfs2/cpfile.h2
-rw-r--r--fs/nilfs2/dat.c6
-rw-r--r--fs/nilfs2/dat.h2
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/direct.c2
-rw-r--r--fs/nilfs2/direct.h2
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/gcinode.c9
-rw-r--r--fs/nilfs2/ifile.c2
-rw-r--r--fs/nilfs2/ifile.h2
-rw-r--r--fs/nilfs2/inode.c201
-rw-r--r--fs/nilfs2/ioctl.c4
-rw-r--r--fs/nilfs2/mdt.c48
-rw-r--r--fs/nilfs2/mdt.h8
-rw-r--r--fs/nilfs2/namei.c2
-rw-r--r--fs/nilfs2/nilfs.h18
-rw-r--r--fs/nilfs2/page.c22
-rw-r--r--fs/nilfs2/page.h3
-rw-r--r--fs/nilfs2/recovery.c2
-rw-r--r--fs/nilfs2/segbuf.c65
-rw-r--r--fs/nilfs2/segbuf.h2
-rw-r--r--fs/nilfs2/segment.c11
-rw-r--r--fs/nilfs2/segment.h2
-rw-r--r--fs/nilfs2/sufile.c2
-rw-r--r--fs/nilfs2/sufile.h2
-rw-r--r--fs/nilfs2/super.c11
-rw-r--r--fs/nilfs2/sysfs.c91
-rw-r--r--fs/nilfs2/sysfs.h2
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/notify/dnotify/dnotify.c23
-rw-r--r--fs/notify/fanotify/fanotify.c328
-rw-r--r--fs/notify/fanotify/fanotify.h196
-rw-r--r--fs/notify/fanotify/fanotify_user.c294
-rw-r--r--fs/notify/fsnotify.c77
-rw-r--r--fs/notify/group.c4
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c5
-rw-r--r--fs/notify/inotify/inotify_user.c17
-rw-r--r--fs/notify/mark.c35
-rw-r--r--fs/notify/notification.c14
-rw-r--r--fs/ntfs/Kconfig1
-rw-r--r--fs/ntfs/aops.c21
-rw-r--r--fs/ntfs/attrib.c2
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ntfs/super.c8
-rw-r--r--fs/ntfs3/file.c3
-rw-r--r--fs/ntfs3/fsntfs.c36
-rw-r--r--fs/ntfs3/inode.c4
-rw-r--r--fs/ntfs3/ntfs_fs.h1
-rw-r--r--fs/ntfs3/super.c4
-rw-r--r--fs/ocfs2/alloc.c25
-rw-r--r--fs/ocfs2/aops.c32
-rw-r--r--fs/ocfs2/cluster/heartbeat.c6
-rw-r--r--fs/ocfs2/cluster/masklog.c11
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/dir.c6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c4
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c18
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c3
-rw-r--r--fs/ocfs2/dlm/dlmthread.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/file.c21
-rw-r--r--fs/ocfs2/filecheck.c3
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/journal.c37
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/localalloc.c6
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/quota_global.c25
-rw-r--r--fs/ocfs2/quota_local.c2
-rw-r--r--fs/ocfs2/stack_user.c18
-rw-r--r--fs/ocfs2/stackglue.c36
-rw-r--r--fs/ocfs2/suballoc.c25
-rw-r--r--fs/ocfs2/super.c66
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/omfs/file.c3
-rw-r--r--fs/open.c27
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/dcache.c4
-rw-r--r--fs/orangefs/inode.c123
-rw-r--r--fs/orangefs/orangefs-bufmap.c7
-rw-r--r--fs/orangefs/orangefs-sysfs.c21
-rw-r--r--fs/orangefs/super.c7
-rw-r--r--fs/overlayfs/copy_up.c35
-rw-r--r--fs/overlayfs/dir.c3
-rw-r--r--fs/overlayfs/file.c20
-rw-r--r--fs/overlayfs/inode.c5
-rw-r--r--fs/overlayfs/overlayfs.h1
-rw-r--r--fs/overlayfs/super.c16
-rw-r--r--fs/pipe.c79
-rw-r--r--fs/posix_acl.c30
-rw-r--r--fs/proc/array.c23
-rw-r--r--fs/proc/base.c53
-rw-r--r--fs/proc/bootconfig.c2
-rw-r--r--fs/proc/fd.c23
-rw-r--r--fs/proc/generic.c6
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/page.c1
-rw-r--r--fs/proc/proc_net.c27
-rw-r--r--fs/proc/proc_sysctl.c72
-rw-r--r--fs/proc/stat.c4
-rw-r--r--fs/proc/task_mmu.c84
-rw-r--r--fs/proc/uptime.c14
-rw-r--r--fs/proc/vmcore.c124
-rw-r--r--fs/proc_namespace.c2
-rw-r--r--fs/pstore/Kconfig1
-rw-r--r--fs/pstore/blk.c10
-rw-r--r--fs/pstore/ftrace.c46
-rw-r--r--fs/pstore/platform.c40
-rw-r--r--fs/pstore/ram_core.c4
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/quota/dquot.c11
-rw-r--r--fs/quota/quota.c1
-rw-r--r--fs/quota/quota_tree.c15
-rw-r--r--fs/ramfs/inode.c12
-rw-r--r--fs/read_write.c39
-rw-r--r--fs/reiserfs/Kconfig10
-rw-r--r--fs/reiserfs/inode.c56
-rw-r--r--fs/reiserfs/journal.c11
-rw-r--r--fs/reiserfs/super.c18
-rw-r--r--fs/remap_range.c129
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/select.c64
-rw-r--r--fs/seq_file.c20
-rw-r--r--fs/signalfd.c17
-rw-r--r--fs/smbfs_common/cifs_arc4.c13
-rw-r--r--fs/smbfs_common/smb2pdu.h1604
-rw-r--r--fs/smbfs_common/smbfsctl.h2
-rw-r--r--fs/splice.c24
-rw-r--r--fs/squashfs/block.c11
-rw-r--r--fs/squashfs/super.c40
-rw-r--r--fs/squashfs/zstd_wrapper.c16
-rw-r--r--fs/stat.c68
-rw-r--r--fs/super.c29
-rw-r--r--fs/sync.c72
-rw-r--r--fs/sysctls.c39
-rw-r--r--fs/sysfs/dir.c3
-rw-r--r--fs/sysfs/file.c153
-rw-r--r--fs/sysfs/group.c15
-rw-r--r--fs/sysfs/mount.c2
-rw-r--r--fs/sysfs/sysfs.h8
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/sysv/itree.c3
-rw-r--r--fs/sysv/super.c6
-rw-r--r--fs/tracefs/inode.c106
-rw-r--r--fs/ubifs/Makefile2
-rw-r--r--fs/ubifs/crypto.c1
-rw-r--r--fs/ubifs/dir.c242
-rw-r--r--fs/ubifs/file.c46
-rw-r--r--fs/ubifs/gc.c19
-rw-r--r--fs/ubifs/io.c55
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c52
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/super.c25
-rw-r--r--fs/ubifs/sysfs.c154
-rw-r--r--fs/ubifs/ubifs.h37
-rw-r--r--fs/udf/dir.c32
-rw-r--r--fs/udf/file.c3
-rw-r--r--fs/udf/ialloc.c2
-rw-r--r--fs/udf/inode.c12
-rw-r--r--fs/udf/lowlevel.c5
-rw-r--r--fs/udf/namei.c11
-rw-r--r--fs/udf/super.c16
-rw-r--r--fs/ufs/inode.c3
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/unicode/.gitignore2
-rw-r--r--fs/unicode/Kconfig7
-rw-r--r--fs/unicode/Makefile19
-rw-r--r--fs/unicode/mkutf8data.c24
-rw-r--r--fs/unicode/utf8-core.c109
-rw-r--r--fs/unicode/utf8-norm.c262
-rw-r--r--fs/unicode/utf8-selftest.c94
-rw-r--r--fs/unicode/utf8data.c_shipped (renamed from fs/unicode/utf8data.h_shipped)22
-rw-r--r--fs/unicode/utf8n.h81
-rw-r--r--fs/userfaultfd.c13
-rw-r--r--fs/vboxsf/file.c2
-rw-r--r--fs/vboxsf/super.c2
-rw-r--r--fs/vboxsf/utils.c1
-rw-r--r--fs/verity/verify.c4
-rw-r--r--fs/xattr.c6
-rw-r--r--fs/xfs/kmem.c3
-rw-r--r--fs/xfs/kmem.h4
-rw-r--r--fs/xfs/libxfs/xfs_ag.c4
-rw-r--r--fs/xfs/libxfs/xfs_ag.h44
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c3
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c148
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h39
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c63
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_attr.c17
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c105
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h35
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c62
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_btree.c339
-rw-r--r--fs/xfs/libxfs/xfs_btree.h99
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c8
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c11
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_defer.c241
-rw-r--r--fs/xfs/libxfs/xfs_defer.h41
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c36
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h8
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h5
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_format.h12
-rw-r--r--fs/xfs/libxfs/xfs_fs.h39
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c5
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c90
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c6
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c24
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c46
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h7
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c65
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c21
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h7
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c116
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_sb.c4
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c18
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h9
-rw-r--r--fs/xfs/scrub/agheader.c66
-rw-r--r--fs/xfs/scrub/agheader_repair.c20
-rw-r--r--fs/xfs/scrub/attr.h2
-rw-r--r--fs/xfs/scrub/bitmap.c22
-rw-r--r--fs/xfs/scrub/bmap.c2
-rw-r--r--fs/xfs/scrub/btree.c121
-rw-r--r--fs/xfs/scrub/btree.h17
-rw-r--r--fs/xfs/scrub/dabtree.c62
-rw-r--r--fs/xfs/scrub/dir.c15
-rw-r--r--fs/xfs/scrub/inode.c14
-rw-r--r--fs/xfs/scrub/quota.c4
-rw-r--r--fs/xfs/scrub/repair.c3
-rw-r--r--fs/xfs/scrub/repair.h3
-rw-r--r--fs/xfs/scrub/scrub.c68
-rw-r--r--fs/xfs/scrub/scrub.h1
-rw-r--r--fs/xfs/scrub/trace.c11
-rw-r--r--fs/xfs/scrub/trace.h10
-rw-r--r--fs/xfs/xfs_aops.c64
-rw-r--r--fs/xfs/xfs_attr_inactive.c2
-rw-r--r--fs/xfs/xfs_bio_io.c45
-rw-r--r--fs/xfs/xfs_bmap_item.c20
-rw-r--r--fs/xfs/xfs_bmap_item.h6
-rw-r--r--fs/xfs/xfs_bmap_util.c23
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c82
-rw-r--r--fs/xfs/xfs_buf.h47
-rw-r--r--fs/xfs/xfs_buf_item.c13
-rw-r--r--fs/xfs/xfs_buf_item.h2
-rw-r--r--fs/xfs/xfs_buf_item_recover.c4
-rw-r--r--fs/xfs/xfs_dir2_readdir.c53
-rw-r--r--fs/xfs/xfs_dquot.c107
-rw-r--r--fs/xfs/xfs_error.c3
-rw-r--r--fs/xfs/xfs_extfree_item.c35
-rw-r--r--fs/xfs/xfs_extfree_item.h6
-rw-r--r--fs/xfs/xfs_file.c100
-rw-r--r--fs/xfs/xfs_fsmap.c4
-rw-r--r--fs/xfs/xfs_fsops.c60
-rw-r--r--fs/xfs/xfs_icache.c70
-rw-r--r--fs/xfs/xfs_icreate_item.c6
-rw-r--r--fs/xfs/xfs_icreate_item.h2
-rw-r--r--fs/xfs/xfs_inode.c147
-rw-r--r--fs/xfs/xfs_inode.h17
-rw-r--r--fs/xfs/xfs_inode_item.c180
-rw-r--r--fs/xfs/xfs_inode_item.h3
-rw-r--r--fs/xfs/xfs_ioctl.c113
-rw-r--r--fs/xfs/xfs_ioctl.h11
-rw-r--r--fs/xfs/xfs_ioctl32.c29
-rw-r--r--fs/xfs/xfs_ioctl32.h22
-rw-r--r--fs/xfs/xfs_iomap.c84
-rw-r--r--fs/xfs/xfs_iomap.h12
-rw-r--r--fs/xfs/xfs_iops.c165
-rw-r--r--fs/xfs/xfs_linux.h3
-rw-r--r--fs/xfs/xfs_log.c134
-rw-r--r--fs/xfs/xfs_log_cil.c120
-rw-r--r--fs/xfs/xfs_log_priv.h16
-rw-r--r--fs/xfs/xfs_log_recover.c92
-rw-r--r--fs/xfs/xfs_mount.c27
-rw-r--r--fs/xfs/xfs_mount.h20
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_pnfs.c49
-rw-r--r--fs/xfs/xfs_qm.c10
-rw-r--r--fs/xfs/xfs_qm.h2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c11
-rw-r--r--fs/xfs/xfs_refcount_item.c20
-rw-r--r--fs/xfs/xfs_refcount_item.h6
-rw-r--r--fs/xfs/xfs_reflink.c15
-rw-r--r--fs/xfs/xfs_rmap_item.c20
-rw-r--r--fs/xfs/xfs_rmap_item.h6
-rw-r--r--fs/xfs/xfs_super.c352
-rw-r--r--fs/xfs/xfs_symlink.c33
-rw-r--r--fs/xfs/xfs_sysfs.c40
-rw-r--r--fs/xfs/xfs_trace.h10
-rw-r--r--fs/xfs/xfs_trans.c165
-rw-r--r--fs/xfs/xfs_trans.h16
-rw-r--r--fs/xfs/xfs_trans_ail.c51
-rw-r--r--fs/xfs/xfs_trans_dquot.c4
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/zonefs/super.c69
918 files changed, 51277 insertions, 42580 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 09fd4a185fd2..d7bc93447c85 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -2,6 +2,7 @@
config 9P_FS
tristate "Plan 9 Resource Sharing Support (9P2000)"
depends on INET && NET_9P
+ select NETFS_SUPPORT
help
If you say Y here, you will get experimental support for
Plan 9 resource sharing via the 9P2000 protocol.
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index c381499f5416..4dac4a0dc5f4 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#include <linux/module.h>
@@ -123,6 +115,7 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
char *name;
size_t size;
void *buffer;
+
if (!acl)
return 0;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index d43c8949e807..ce5175d463dd 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -1,28 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#ifndef FS_9P_ACL_H
#define FS_9P_ACL_H
#ifdef CONFIG_9P_FS_POSIX_ACL
-extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu);
-extern int v9fs_acl_chmod(struct inode *, struct p9_fid *);
-extern int v9fs_set_create_acl(struct inode *, struct p9_fid *,
- struct posix_acl *, struct posix_acl *);
-extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
- struct posix_acl **dpacl, struct posix_acl **pacl);
-extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid);
+struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type,
+ bool rcu);
+int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid);
+int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
+ struct posix_acl *dacl, struct posix_acl *acl);
+int v9fs_acl_mode(struct inode *dir, umode_t *modep,
+ struct posix_acl **dpacl, struct posix_acl **pacl);
+void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
#else
#define v9fs_iop_get_acl NULL
static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 1769a44f4819..1c8dc696d516 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -16,323 +16,59 @@
#include "v9fs.h"
#include "cache.h"
-#define CACHETAG_LEN 11
-
-struct fscache_netfs v9fs_cache_netfs = {
- .name = "9p",
- .version = 0,
-};
-
-/*
- * v9fs_random_cachetag - Generate a random tag to be associated
- * with a new cache session.
- *
- * The value of jiffies is used for a fairly randomly cache tag.
- */
-
-static
-int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
+int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses,
+ const char *dev_name)
{
- v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL);
- if (!v9ses->cachetag)
- return -ENOMEM;
+ struct fscache_volume *vcookie;
+ char *name, *p;
- return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
-}
-
-const struct fscache_cookie_def v9fs_cache_session_index_def = {
- .name = "9P.session",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
+ name = kasprintf(GFP_KERNEL, "9p,%s,%s",
+ dev_name, v9ses->cachetag ?: v9ses->aname);
+ if (!name)
+ return -ENOMEM;
-void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
-{
- /* If no cache session tag was specified, we generate a random one. */
- if (!v9ses->cachetag) {
- if (v9fs_random_cachetag(v9ses) < 0) {
- v9ses->fscache = NULL;
- kfree(v9ses->cachetag);
- v9ses->cachetag = NULL;
- return;
+ for (p = name; *p; p++)
+ if (*p == '/')
+ *p = ';';
+
+ vcookie = fscache_acquire_volume(name, NULL, NULL, 0);
+ p9_debug(P9_DEBUG_FSC, "session %p get volume %p (%s)\n",
+ v9ses, vcookie, name);
+ if (IS_ERR(vcookie)) {
+ if (vcookie != ERR_PTR(-EBUSY)) {
+ kfree(name);
+ return PTR_ERR(vcookie);
}
+ pr_err("Cache volume key already in use (%s)\n", name);
+ vcookie = NULL;
}
-
- v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
- &v9fs_cache_session_index_def,
- v9ses->cachetag,
- strlen(v9ses->cachetag),
- NULL, 0,
- v9ses, 0, true);
- p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
- v9ses, v9ses->fscache);
-}
-
-void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
-{
- p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
- v9ses, v9ses->fscache);
- fscache_relinquish_cookie(v9ses->fscache, NULL, false);
- v9ses->fscache = NULL;
-}
-
-static enum
-fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
- const void *buffer,
- uint16_t buflen,
- loff_t object_size)
-{
- const struct v9fs_inode *v9inode = cookie_netfs_data;
-
- if (buflen != sizeof(v9inode->qid.version))
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- if (memcmp(buffer, &v9inode->qid.version,
- sizeof(v9inode->qid.version)))
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- return FSCACHE_CHECKAUX_OKAY;
+ v9ses->fscache = vcookie;
+ kfree(name);
+ return 0;
}
-const struct fscache_cookie_def v9fs_cache_inode_index_def = {
- .name = "9p.inode",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .check_aux = v9fs_cache_inode_check_aux,
-};
-
void v9fs_cache_inode_get_cookie(struct inode *inode)
{
- struct v9fs_inode *v9inode;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct v9fs_session_info *v9ses;
+ __le32 version;
+ __le64 path;
if (!S_ISREG(inode->i_mode))
return;
-
- v9inode = V9FS_I(inode);
- if (v9inode->fscache)
+ if (WARN_ON(v9fs_inode_cookie(v9inode)))
return;
+ version = cpu_to_le32(v9inode->qid.version);
+ path = cpu_to_le64(v9inode->qid.path);
v9ses = v9fs_inode2v9ses(inode);
- v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
- &v9fs_cache_inode_index_def,
- &v9inode->qid.path,
- sizeof(v9inode->qid.path),
- &v9inode->qid.version,
- sizeof(v9inode->qid.version),
- v9inode,
- i_size_read(&v9inode->vfs_inode),
- true);
+ v9inode->netfs_ctx.cache =
+ fscache_acquire_cookie(v9fs_session_cache(v9ses),
+ 0,
+ &path, sizeof(path),
+ &version, sizeof(version),
+ i_size_read(&v9inode->vfs_inode));
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
- inode, v9inode->fscache);
-}
-
-void v9fs_cache_inode_put_cookie(struct inode *inode)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- if (!v9inode->fscache)
- return;
- p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
- inode, v9inode->fscache);
-
- fscache_relinquish_cookie(v9inode->fscache, &v9inode->qid.version,
- false);
- v9inode->fscache = NULL;
-}
-
-void v9fs_cache_inode_flush_cookie(struct inode *inode)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- if (!v9inode->fscache)
- return;
- p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
- inode, v9inode->fscache);
-
- fscache_relinquish_cookie(v9inode->fscache, NULL, true);
- v9inode->fscache = NULL;
-}
-
-void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- if (!v9inode->fscache)
- return;
-
- mutex_lock(&v9inode->fscache_lock);
-
- if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
- v9fs_cache_inode_flush_cookie(inode);
- else
- v9fs_cache_inode_get_cookie(inode);
-
- mutex_unlock(&v9inode->fscache_lock);
-}
-
-void v9fs_cache_inode_reset_cookie(struct inode *inode)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct v9fs_session_info *v9ses;
- struct fscache_cookie *old;
-
- if (!v9inode->fscache)
- return;
-
- old = v9inode->fscache;
-
- mutex_lock(&v9inode->fscache_lock);
- fscache_relinquish_cookie(v9inode->fscache, NULL, true);
-
- v9ses = v9fs_inode2v9ses(inode);
- v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
- &v9fs_cache_inode_index_def,
- &v9inode->qid.path,
- sizeof(v9inode->qid.path),
- &v9inode->qid.version,
- sizeof(v9inode->qid.version),
- v9inode,
- i_size_read(&v9inode->vfs_inode),
- true);
- p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
- inode, old, v9inode->fscache);
-
- mutex_unlock(&v9inode->fscache_lock);
-}
-
-int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
-{
- struct inode *inode = page->mapping->host;
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- BUG_ON(!v9inode->fscache);
-
- return fscache_maybe_release_page(v9inode->fscache, page, gfp);
-}
-
-void __v9fs_fscache_invalidate_page(struct page *page)
-{
- struct inode *inode = page->mapping->host;
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- BUG_ON(!v9inode->fscache);
-
- if (PageFsCache(page)) {
- fscache_wait_on_page_write(v9inode->fscache, page);
- BUG_ON(!PageLocked(page));
- fscache_uncache_page(v9inode->fscache, page);
- }
-}
-
-static void v9fs_vfs_readpage_complete(struct page *page, void *data,
- int error)
-{
- if (!error)
- SetPageUptodate(page);
-
- unlock_page(page);
-}
-
-/*
- * __v9fs_readpage_from_fscache - read a page from cache
- *
- * Returns 0 if the pages are in cache and a BIO is submitted,
- * 1 if the pages are not in cache and -error otherwise.
- */
-
-int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
-{
- int ret;
- const struct v9fs_inode *v9inode = V9FS_I(inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- if (!v9inode->fscache)
- return -ENOBUFS;
-
- ret = fscache_read_or_alloc_page(v9inode->fscache,
- page,
- v9fs_vfs_readpage_complete,
- NULL,
- GFP_KERNEL);
- switch (ret) {
- case -ENOBUFS:
- case -ENODATA:
- p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret);
- return 1;
- case 0:
- p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
- return ret;
- default:
- p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
- return ret;
- }
-}
-
-/*
- * __v9fs_readpages_from_fscache - read multiple pages from cache
- *
- * Returns 0 if the pages are in cache and a BIO is submitted,
- * 1 if the pages are not in cache and -error otherwise.
- */
-
-int __v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- int ret;
- const struct v9fs_inode *v9inode = V9FS_I(inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages);
- if (!v9inode->fscache)
- return -ENOBUFS;
-
- ret = fscache_read_or_alloc_pages(v9inode->fscache,
- mapping, pages, nr_pages,
- v9fs_vfs_readpage_complete,
- NULL,
- mapping_gfp_mask(mapping));
- switch (ret) {
- case -ENOBUFS:
- case -ENODATA:
- p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret);
- return 1;
- case 0:
- BUG_ON(!list_empty(pages));
- BUG_ON(*nr_pages != 0);
- p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
- return ret;
- default:
- p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
- return ret;
- }
-}
-
-/*
- * __v9fs_readpage_to_fscache - write a page to the cache
- *
- */
-
-void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
-{
- int ret;
- const struct v9fs_inode *v9inode = V9FS_I(inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- ret = fscache_write_page(v9inode->fscache, page,
- i_size_read(&v9inode->vfs_inode), GFP_KERNEL);
- p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
- if (ret != 0)
- v9fs_uncache_page(inode, page);
-}
-
-/*
- * wait for a page to complete writing to the cache
- */
-void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
-{
- const struct v9fs_inode *v9inode = V9FS_I(inode);
- p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- if (PageFsCache(page))
- fscache_wait_on_page_write(v9inode->fscache, page);
+ inode, v9fs_inode_cookie(v9inode));
}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 00f107af443e..1923affcdc62 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -7,83 +7,15 @@
#ifndef _9P_CACHE_H
#define _9P_CACHE_H
-#ifdef CONFIG_9P_FSCACHE
+
#include <linux/fscache.h>
-#include <linux/spinlock.h>
-extern struct fscache_netfs v9fs_cache_netfs;
-extern const struct fscache_cookie_def v9fs_cache_session_index_def;
-extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
+#ifdef CONFIG_9P_FSCACHE
-extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses);
-extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses);
+extern int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses,
+ const char *dev_name);
extern void v9fs_cache_inode_get_cookie(struct inode *inode);
-extern void v9fs_cache_inode_put_cookie(struct inode *inode);
-extern void v9fs_cache_inode_flush_cookie(struct inode *inode);
-extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp);
-extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
-
-extern int __v9fs_cache_register(void);
-extern void __v9fs_cache_unregister(void);
-
-extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
-extern void __v9fs_fscache_invalidate_page(struct page *page);
-extern int __v9fs_readpage_from_fscache(struct inode *inode,
- struct page *page);
-extern int __v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages);
-extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
-extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page);
-
-static inline int v9fs_fscache_release_page(struct page *page,
- gfp_t gfp)
-{
- return __v9fs_fscache_release_page(page, gfp);
-}
-
-static inline void v9fs_fscache_invalidate_page(struct page *page)
-{
- __v9fs_fscache_invalidate_page(page);
-}
-
-static inline int v9fs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- return __v9fs_readpage_from_fscache(inode, page);
-}
-
-static inline int v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- return __v9fs_readpages_from_fscache(inode, mapping, pages,
- nr_pages);
-}
-
-static inline void v9fs_readpage_to_fscache(struct inode *inode,
- struct page *page)
-{
- if (PageFsCache(page))
- __v9fs_readpage_to_fscache(inode, page);
-}
-
-static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- fscache_uncache_page(v9inode->fscache, page);
- BUG_ON(PageFsCache(page));
-}
-
-static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page)
-{
- return __v9fs_fscache_wait_on_page_write(inode, page);
-}
#else /* CONFIG_9P_FSCACHE */
@@ -91,47 +23,5 @@ static inline void v9fs_cache_inode_get_cookie(struct inode *inode)
{
}
-static inline void v9fs_cache_inode_put_cookie(struct inode *inode)
-{
-}
-
-static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *file)
-{
-}
-
-static inline int v9fs_fscache_release_page(struct page *page,
- gfp_t gfp) {
- return 1;
-}
-
-static inline void v9fs_fscache_invalidate_page(struct page *page) {}
-
-static inline int v9fs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- return -ENOBUFS;
-}
-
-static inline int v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- return -ENOBUFS;
-}
-
-static inline void v9fs_readpage_to_fscache(struct inode *inode,
- struct page *page)
-{}
-
-static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
-{}
-
-static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page)
-{
- return;
-}
-
#endif /* CONFIG_9P_FSCACHE */
#endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b8863dd0de5c..79df61fe0e59 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -96,13 +96,10 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
dentry, dentry, from_kuid(&init_user_ns, uid),
any);
ret = NULL;
-
- if (d_inode(dentry))
- ret = v9fs_fid_find_inode(d_inode(dentry), uid);
-
/* we'll recheck under lock if there's anything to look in */
- if (!ret && dentry->d_fsdata) {
+ if (dentry->d_fsdata) {
struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata;
+
spin_lock(&dentry->d_lock);
hlist_for_each_entry(fid, h, dlist) {
if (any || uid_eq(fid->uid, uid)) {
@@ -112,6 +109,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
}
}
spin_unlock(&dentry->d_lock);
+ } else {
+ if (dentry->d_inode)
+ ret = v9fs_fid_find_inode(dentry->d_inode, uid);
}
return ret;
@@ -185,7 +185,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
return ERR_PTR(-EPERM);
if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
- uname = NULL;
+ uname = NULL;
else
uname = v9ses->uname;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2e0fa7c932db..e28ddf763b3b 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/v9fs.c
- *
* This file contains functions assisting in mapping VFS to 9P2000
*
* Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -166,7 +164,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
substring_t args[MAX_OPT_ARGS];
char *p;
int option = 0;
- char *s, *e;
+ char *s;
int ret = 0;
/* setup defaults */
@@ -190,8 +188,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
while ((p = strsep(&options, ",")) != NULL) {
int token, r;
+
if (!*p)
continue;
+
token = match_token(p, tokens, args);
switch (token) {
case Opt_debug:
@@ -321,12 +321,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
v9ses->flags |= V9FS_ACCESS_CLIENT;
} else {
uid_t uid;
+
v9ses->flags |= V9FS_ACCESS_SINGLE;
- uid = simple_strtoul(s, &e, 10);
- if (*e != '\0') {
- ret = -EINVAL;
- pr_info("Unknown access argument %s\n",
- s);
+ r = kstrtouint(s, 10, &uid);
+ if (r) {
+ ret = r;
+ pr_info("Unknown access argument %s: %d\n",
+ s, r);
kfree(s);
continue;
}
@@ -468,7 +469,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
#ifdef CONFIG_9P_FSCACHE
/* register the session for caching */
- v9fs_cache_session_get_cookie(v9ses);
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ rc = v9fs_cache_session_get_cookie(v9ses, dev_name);
+ if (rc < 0)
+ goto err_clnt;
+ }
#endif
spin_lock(&v9fs_sessionlist_lock);
list_add(&v9ses->slist, &v9fs_sessionlist);
@@ -501,8 +506,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
}
#ifdef CONFIG_9P_FSCACHE
- if (v9ses->fscache)
- v9fs_cache_session_put_cookie(v9ses);
+ fscache_relinquish_volume(v9fs_session_cache(v9ses), NULL, false);
kfree(v9ses->cachetag);
#endif
kfree(v9ses->uname);
@@ -520,7 +524,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
* mark transport as disconnected and cancel all pending requests.
*/
-void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
+void v9fs_session_cancel(struct v9fs_session_info *v9ses)
+{
p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
p9_client_disconnect(v9ses->clnt);
}
@@ -618,9 +623,7 @@ static void v9fs_sysfs_cleanup(void)
static void v9fs_inode_init_once(void *foo)
{
struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
-#ifdef CONFIG_9P_FSCACHE
- v9inode->fscache = NULL;
-#endif
+
memset(&v9inode->qid, 0, sizeof(v9inode->qid));
inode_init_once(&v9inode->vfs_inode);
}
@@ -659,23 +662,16 @@ static void v9fs_destroy_inode_cache(void)
static int v9fs_cache_register(void)
{
int ret;
+
ret = v9fs_init_inode_cache();
if (ret < 0)
return ret;
-#ifdef CONFIG_9P_FSCACHE
- ret = fscache_register_netfs(&v9fs_cache_netfs);
- if (ret < 0)
- v9fs_destroy_inode_cache();
-#endif
return ret;
}
static void v9fs_cache_unregister(void)
{
v9fs_destroy_inode_cache();
-#ifdef CONFIG_9P_FSCACHE
- fscache_unregister_netfs(&v9fs_cache_netfs);
-#endif
}
/**
@@ -686,6 +682,7 @@ static void v9fs_cache_unregister(void)
static int __init init_v9fs(void)
{
int err;
+
pr_info("Installing v9fs 9p2000 file system support\n");
/* TODO: Setup list of registered trasnport modules */
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 4ca56c5dd637..ec0e8df3b2eb 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -9,6 +9,7 @@
#define FS_9P_V9FS_H
#include <linux/backing-dev.h>
+#include <linux/netfs.h>
/**
* enum p9_session_flags - option flags for each 9P session
@@ -89,7 +90,7 @@ struct v9fs_session_info {
unsigned int cache;
#ifdef CONFIG_9P_FSCACHE
char *cachetag;
- struct fscache_cookie *fscache;
+ struct fscache_volume *fscache;
#endif
char *uname; /* user name to mount as */
@@ -108,15 +109,15 @@ struct v9fs_session_info {
#define V9FS_INO_INVALID_ATTR 0x01
struct v9fs_inode {
-#ifdef CONFIG_9P_FSCACHE
- struct mutex fscache_lock;
- struct fscache_cookie *fscache;
-#endif
+ struct {
+ /* These must be contiguous */
+ struct inode vfs_inode; /* the VFS's inode record */
+ struct netfs_i_context netfs_ctx; /* Netfslib context */
+ };
struct p9_qid qid;
unsigned int cache_validity;
struct p9_fid *writeback_fid;
struct mutex v_mutex;
- struct inode vfs_inode;
};
static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
@@ -124,15 +125,34 @@ static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
return container_of(inode, struct v9fs_inode, vfs_inode);
}
+static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode)
+{
+#ifdef CONFIG_9P_FSCACHE
+ return netfs_i_cookie(&v9inode->vfs_inode);
+#else
+ return NULL;
+#endif
+}
+
+static inline struct fscache_volume *v9fs_session_cache(struct v9fs_session_info *v9ses)
+{
+#ifdef CONFIG_9P_FSCACHE
+ return v9ses->fscache;
+#else
+ return NULL;
+#endif
+}
+
+
extern int v9fs_show_options(struct seq_file *m, struct dentry *root);
-struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
- char *);
+struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
+ const char *dev_name, char *data);
extern void v9fs_session_close(struct v9fs_session_info *v9ses);
extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags);
+ unsigned int flags);
extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rename(struct user_namespace *mnt_userns,
@@ -145,6 +165,7 @@ extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern const struct netfs_request_ops v9fs_req_ops;
extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
struct p9_fid *fid,
struct super_block *sb, int new);
@@ -158,7 +179,7 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
{
- return (inode->i_sb->s_fs_info);
+ return inode->i_sb->s_fs_info;
}
static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index d44ade76966a..bc417da7e9c1 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -44,9 +44,10 @@ extern struct kmem_cache *v9fs_inode_cache;
struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t);
+struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
+ dev_t rdev);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, umode_t mode, dev_t);
+ struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
@@ -59,8 +60,8 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
int v9fs_uflags2omode(int uflags, int extended);
void v9fs_blank_wstat(struct p9_wstat *wstat);
-int v9fs_vfs_setattr_dotl(struct user_namespace *, struct dentry *,
- struct iattr *);
+int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *iattr);
int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
int datasync);
int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
@@ -68,9 +69,9 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
static inline void v9fs_invalidate_inode_attr(struct inode *inode)
{
struct v9fs_inode *v9inode;
+
v9inode = V9FS_I(inode);
v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
- return;
}
int v9fs_open_to_dotl_flags(int flags);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 1c4f1b39cc95..501128188343 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_addr.c
- *
* This file contians vfs address (mmap) ops for 9P2000.
*
* Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -18,8 +16,9 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/sched.h>
+#include <linux/swap.h>
#include <linux/uio.h>
-#include <linux/bvec.h>
+#include <linux/netfs.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -29,90 +28,77 @@
#include "fid.h"
/**
- * v9fs_fid_readpage - read an entire page in from 9P
- * @data: Opaque pointer to the fid being read
- * @page: structure to page
- *
+ * v9fs_issue_read - Issue a read from 9P
+ * @subreq: The read to make
*/
-static int v9fs_fid_readpage(void *data, struct page *page)
+static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
- struct p9_fid *fid = data;
- struct inode *inode = page->mapping->host;
- struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE};
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct p9_fid *fid = rreq->netfs_priv;
struct iov_iter to;
- int retval, err;
-
- p9_debug(P9_DEBUG_VFS, "\n");
-
- BUG_ON(!PageLocked(page));
+ loff_t pos = subreq->start + subreq->transferred;
+ size_t len = subreq->len - subreq->transferred;
+ int total, err;
- retval = v9fs_readpage_from_fscache(inode, page);
- if (retval == 0)
- return retval;
-
- iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE);
-
- retval = p9_client_read(fid, page_offset(page), &to, &err);
- if (err) {
- v9fs_uncache_page(inode, page);
- retval = err;
- goto done;
- }
+ iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len);
- zero_user(page, retval, PAGE_SIZE - retval);
- flush_dcache_page(page);
- SetPageUptodate(page);
+ total = p9_client_read(fid, pos, &to, &err);
- v9fs_readpage_to_fscache(inode, page);
- retval = 0;
+ /* If we just extended the file size, any portion not in the
+ * cache won't be on the server either, so it reads as zeroes. */
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
-done:
- unlock_page(page);
- return retval;
+ netfs_subreq_terminated(subreq, err ?: total, false);
}
/**
- * v9fs_vfs_readpage - read an entire page in from 9P
- *
- * @filp: file being read
- * @page: structure to page
- *
+ * v9fs_init_request - Initialise a read request
+ * @rreq: The read request
+ * @file: The file being read from
*/
-
-static int v9fs_vfs_readpage(struct file *filp, struct page *page)
+static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- return v9fs_fid_readpage(filp->private_data, page);
+ struct p9_fid *fid = file->private_data;
+
+ refcount_inc(&fid->count);
+ rreq->netfs_priv = fid;
+ return 0;
}
/**
- * v9fs_vfs_readpages - read a set of pages from 9P
- *
- * @filp: file being read
- * @mapping: the address space
- * @pages: list of pages to read
- * @nr_pages: count of pages to read
- *
+ * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request
+ * @mapping: unused mapping of request to cleanup
+ * @priv: private data to clean up, a fid, guaranteed non-null.
*/
-
-static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void v9fs_req_cleanup(struct address_space *mapping, void *priv)
{
- int ret = 0;
- struct inode *inode;
+ struct p9_fid *fid = priv;
- inode = mapping->host;
- p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+ p9_client_clunk(fid);
+}
- ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
- if (ret == 0)
- return ret;
+/**
+ * v9fs_begin_cache_operation - Begin a cache operation for a read
+ * @rreq: The read request
+ */
+static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
+{
+#ifdef CONFIG_9P_FSCACHE
+ struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
- ret = read_cache_pages(mapping, pages, v9fs_fid_readpage,
- filp->private_data);
- p9_debug(P9_DEBUG_VFS, " = %d\n", ret);
- return ret;
+ return fscache_begin_read_operation(&rreq->cache_resources, cookie);
+#else
+ return -ENOBUFS;
+#endif
}
+const struct netfs_request_ops v9fs_req_ops = {
+ .init_request = v9fs_init_request,
+ .begin_cache_operation = v9fs_begin_cache_operation,
+ .issue_read = v9fs_issue_read,
+ .cleanup = v9fs_req_cleanup,
+};
+
/**
* v9fs_release_page - release the private state associated with a page
* @page: The page to be released
@@ -123,99 +109,114 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
static int v9fs_release_page(struct page *page, gfp_t gfp)
{
- if (PagePrivate(page))
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio_inode(folio);
+
+ if (folio_test_private(folio))
return 0;
- return v9fs_fscache_release_page(page, gfp);
+#ifdef CONFIG_9P_FSCACHE
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return 0;
+ folio_wait_fscache(folio);
+ }
+#endif
+ fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode)));
+ return 1;
}
-/**
- * v9fs_invalidate_page - Invalidate a page completely or partially
- * @page: The page to be invalidated
- * @offset: offset of the invalidated region
- * @length: length of the invalidated region
- */
+static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
+{
+ folio_wait_fscache(folio);
+}
-static void v9fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
+ bool was_async)
{
- /*
- * If called with zero offset, we should release
- * the private state assocated with the page
- */
- if (offset == 0 && length == PAGE_SIZE)
- v9fs_fscache_invalidate_page(page);
+ struct v9fs_inode *v9inode = priv;
+ __le32 version;
+
+ if (IS_ERR_VALUE(transferred_or_error) &&
+ transferred_or_error != -ENOBUFS) {
+ version = cpu_to_le32(v9inode->qid.version);
+ fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
+ i_size_read(&v9inode->vfs_inode), 0);
+ }
}
-static int v9fs_vfs_writepage_locked(struct page *page)
+static int v9fs_vfs_write_folio_locked(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio_inode(folio);
struct v9fs_inode *v9inode = V9FS_I(inode);
- loff_t size = i_size_read(inode);
+ struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode);
+ loff_t start = folio_pos(folio);
+ loff_t i_size = i_size_read(inode);
struct iov_iter from;
- struct bio_vec bvec;
- int err, len;
+ size_t len = folio_size(folio);
+ int err;
+
+ if (start >= i_size)
+ return 0; /* Simultaneous truncation occurred */
- if (page->index == size >> PAGE_SHIFT)
- len = size & ~PAGE_MASK;
- else
- len = PAGE_SIZE;
+ len = min_t(loff_t, i_size - start, len);
- bvec.bv_page = page;
- bvec.bv_offset = 0;
- bvec.bv_len = len;
- iov_iter_bvec(&from, WRITE, &bvec, 1, len);
+ iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len);
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
- set_page_writeback(page);
+ folio_wait_fscache(folio);
+ folio_start_writeback(folio);
- p9_client_write(v9inode->writeback_fid, page_offset(page), &from, &err);
+ p9_client_write(v9inode->writeback_fid, start, &from, &err);
- end_page_writeback(page);
+ if (err == 0 &&
+ fscache_cookie_enabled(cookie) &&
+ test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
+ folio_start_fscache(folio);
+ fscache_write_to_cache(v9fs_inode_cookie(v9inode),
+ folio_mapping(folio), start, len, i_size,
+ v9fs_write_to_cache_done, v9inode,
+ true);
+ }
+
+ folio_end_writeback(folio);
return err;
}
static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
int retval;
- p9_debug(P9_DEBUG_VFS, "page %p\n", page);
+ p9_debug(P9_DEBUG_VFS, "folio %p\n", folio);
- retval = v9fs_vfs_writepage_locked(page);
+ retval = v9fs_vfs_write_folio_locked(folio);
if (retval < 0) {
if (retval == -EAGAIN) {
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
retval = 0;
} else {
- SetPageError(page);
- mapping_set_error(page->mapping, retval);
+ mapping_set_error(folio_mapping(folio), retval);
}
} else
retval = 0;
- unlock_page(page);
+ folio_unlock(folio);
return retval;
}
-/**
- * v9fs_launder_page - Writeback a dirty page
- * @page: The page to be cleaned up
- *
- * Returns 0 on success.
- */
-
-static int v9fs_launder_page(struct page *page)
+static int v9fs_launder_folio(struct folio *folio)
{
int retval;
- struct inode *inode = page->mapping->host;
- v9fs_fscache_wait_on_page_write(inode, page);
- if (clear_page_dirty_for_io(page)) {
- retval = v9fs_vfs_writepage_locked(page);
+ if (folio_clear_dirty_for_io(folio)) {
+ retval = v9fs_vfs_write_folio_locked(folio);
if (retval)
return retval;
}
+ folio_wait_fscache(folio);
return 0;
}
@@ -242,11 +243,13 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t pos = iocb->ki_pos;
ssize_t n;
int err = 0;
+
if (iov_iter_rw(iter) == WRITE) {
n = p9_client_write(file->private_data, pos, iter, &err);
if (n) {
struct inode *inode = file_inode(file);
loff_t i_size = i_size_read(inode);
+
if (pos + n > i_size)
inode_add_bytes(inode, pos + n - i_size);
}
@@ -257,58 +260,49 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned int len, unsigned int flags,
+ struct page **subpagep, void **fsdata)
{
- int retval = 0;
- struct page *page;
- struct v9fs_inode *v9inode;
- pgoff_t index = pos >> PAGE_SHIFT;
- struct inode *inode = mapping->host;
-
+ int retval;
+ struct folio *folio;
+ struct v9fs_inode *v9inode = V9FS_I(mapping->host);
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
- v9inode = V9FS_I(inode);
-start:
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
- retval = -ENOMEM;
- goto out;
- }
BUG_ON(!v9inode->writeback_fid);
- if (PageUptodate(page))
- goto out;
- if (len == PAGE_SIZE)
- goto out;
+ /* Prefetch area to be written into the cache if we're caching this
+ * file. We need to do this before we get a lock on the page in case
+ * there's more than one writer competing for the same cache block.
+ */
+ retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata);
+ if (retval < 0)
+ return retval;
- retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
- put_page(page);
- if (!retval)
- goto start;
-out:
- *pagep = page;
+ *subpagep = &folio->page;
return retval;
}
static int v9fs_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct page *subpage, void *fsdata)
{
loff_t last_pos = pos + copied;
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(subpage);
+ struct inode *inode = mapping->host;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (unlikely(copied < len)) {
copied = 0;
goto out;
- } else if (len == PAGE_SIZE) {
- SetPageUptodate(page);
}
+
+ folio_mark_uptodate(folio);
}
+
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold the i_mutex.
@@ -316,25 +310,40 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
if (last_pos > inode->i_size) {
inode_add_bytes(inode, last_pos - inode->i_size);
i_size_write(inode, last_pos);
+ fscache_update_cookie(v9fs_inode_cookie(v9inode), NULL, &last_pos);
}
- set_page_dirty(page);
+ folio_mark_dirty(folio);
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
+#ifdef CONFIG_9P_FSCACHE
+/*
+ * Mark a page as having been made dirty and thus needing writeback. We also
+ * need to pin the cache object to write back to.
+ */
+static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ struct v9fs_inode *v9inode = V9FS_I(mapping->host);
+
+ return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode));
+}
+#else
+#define v9fs_dirty_folio filemap_dirty_folio
+#endif
const struct address_space_operations v9fs_addr_operations = {
- .readpage = v9fs_vfs_readpage,
- .readpages = v9fs_vfs_readpages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .readpage = netfs_readpage,
+ .readahead = netfs_readahead,
+ .dirty_folio = v9fs_dirty_folio,
.writepage = v9fs_vfs_writepage,
.write_begin = v9fs_write_begin,
.write_end = v9fs_write_end,
.releasepage = v9fs_release_page,
- .invalidatepage = v9fs_invalidate_page,
- .launder_page = v9fs_launder_page,
+ .invalidate_folio = v9fs_invalidate_folio,
+ .launder_folio = v9fs_launder_folio,
.direct_IO = v9fs_direct_IO,
};
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 4b4292123b3d..1c609e99d280 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_dentry.c
- *
* This file contians vfs dentry ops for the 9P2000 protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -52,6 +50,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
static void v9fs_dentry_release(struct dentry *dentry)
{
struct hlist_node *p, *n;
+
p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
dentry, dentry);
hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
@@ -76,6 +75,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
int retval;
struct v9fs_session_info *v9ses;
+
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b6a5a0be444d..958680f7f23e 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_dir.c
- *
* This file contains vfs directory ops for the 9P2000 protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -19,6 +17,7 @@
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/uio.h>
+#include <linux/fscache.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -71,6 +70,7 @@ static inline int dt_type(struct p9_wstat *mistat)
static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
{
struct p9_fid *fid = filp->private_data;
+
if (!fid->rdir)
fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
return fid->rdir;
@@ -108,6 +108,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
if (rdir->tail == rdir->head) {
struct iov_iter to;
int n;
+
iov_iter_kvec(&to, READ, &kvec, 1, buflen);
n = p9_client_read(file->private_data, ctx->pos, &to,
&err);
@@ -205,7 +206,10 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
int v9fs_dir_release(struct inode *inode, struct file *filp)
{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid;
+ __le32 version;
+ loff_t i_size;
fid = filp->private_data;
p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
@@ -216,6 +220,15 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
spin_unlock(&inode->i_lock);
p9_client_clunk(fid);
}
+
+ if ((filp->f_mode & FMODE_WRITE)) {
+ version = cpu_to_le32(v9inode->qid.version);
+ i_size = i_size_read(inode);
+ fscache_unuse_cookie(v9fs_inode_cookie(v9inode),
+ &version, &i_size);
+ } else {
+ fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL);
+ }
return 0;
}
@@ -233,5 +246,5 @@ const struct file_operations v9fs_dir_operations_dotl = {
.iterate_shared = v9fs_dir_readdir_dotl,
.open = v9fs_file_open,
.release = v9fs_dir_release,
- .fsync = v9fs_file_fsync_dotl,
+ .fsync = v9fs_file_fsync_dotl,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 246235ebdb70..2573c08f335c 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_file.c
- *
* This file contians vfs file ops for 9P2000.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -95,7 +93,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
}
mutex_unlock(&v9inode->v_mutex);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- v9fs_cache_inode_set_cookie(inode, file);
+ fscache_use_cookie(v9fs_inode_cookie(v9inode),
+ file->f_mode & FMODE_WRITE);
v9fs_open_fid_add(inode, fid);
return 0;
out_error:
@@ -116,7 +115,6 @@ out_error:
static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
{
- int res = 0;
struct inode *inode = file_inode(filp);
p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
@@ -126,7 +124,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
- return res;
+ return 0;
}
static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
@@ -141,8 +139,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
fid = filp->private_data;
BUG_ON(fid == NULL);
- if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
- BUG();
+ BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX);
res = locks_lock_file_wait(filp, fl);
if (res < 0)
@@ -408,6 +405,7 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
loff_t i_size;
unsigned long pg_start, pg_end;
+
pg_start = origin >> PAGE_SHIFT;
pg_end = (origin + retval - 1) >> PAGE_SHIFT;
if (inode->i_mapping && inode->i_mapping->nrpages)
@@ -529,29 +527,38 @@ static vm_fault_t
v9fs_vm_page_mkwrite(struct vm_fault *vmf)
{
struct v9fs_inode *v9inode;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct file *filp = vmf->vma->vm_file;
struct inode *inode = file_inode(filp);
- p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
- page, (unsigned long)filp->private_data);
+ p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n",
+ folio, (unsigned long)filp->private_data);
+
+ v9inode = V9FS_I(inode);
+
+ /* Wait for the page to be written to the cache before we allow it to
+ * be modified. We then assume the entire page will need writing back.
+ */
+#ifdef CONFIG_9P_FSCACHE
+ if (folio_test_fscache(folio) &&
+ folio_wait_fscache_killable(folio) < 0)
+ return VM_FAULT_NOPAGE;
+#endif
/* Update file times before taking page lock */
file_update_time(filp);
- v9inode = V9FS_I(inode);
- /* make sure the cache has finished storing the page */
- v9fs_fscache_wait_on_page_write(inode, page);
BUG_ON(!v9inode->writeback_fid);
- lock_page(page);
- if (page->mapping != inode->i_mapping)
+ if (folio_lock_killable(folio) < 0)
+ return VM_FAULT_RETRY;
+ if (folio_mapping(folio) != inode->i_mapping)
goto out_unlock;
- wait_for_stable_page(page);
+ folio_wait_stable(folio);
return VM_FAULT_LOCKED;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
return VM_FAULT_NOPAGE;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 08f48b70a741..55367ecb9442 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_inode.c
- *
* This file contains vfs inode ops for the 9P2000 protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -49,6 +47,7 @@ static const struct inode_operations v9fs_symlink_inode_operations;
static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode)
{
int res;
+
res = mode & 0777;
if (S_ISDIR(mode))
res |= P9_DMDIR;
@@ -110,7 +109,7 @@ static int p9mode2perm(struct v9fs_session_info *v9ses,
static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
struct p9_wstat *stat, dev_t *rdev)
{
- int res;
+ int res, r;
u32 mode = stat->mode;
*rdev = 0;
@@ -128,11 +127,16 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
res |= S_IFIFO;
else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
&& (v9ses->nodev == 0)) {
- char type = 0, ext[32];
+ char type = 0;
int major = -1, minor = -1;
- strlcpy(ext, stat->extension, sizeof(ext));
- sscanf(ext, "%c %i %i", &type, &major, &minor);
+ r = sscanf(stat->extension, "%c %i %i", &type, &major, &minor);
+ if (r != 3) {
+ p9_debug(P9_DEBUG_ERROR,
+ "invalid device string, umode will be bogus: %s\n",
+ stat->extension);
+ return res;
+ }
switch (type) {
case 'c':
res |= S_IFCHR;
@@ -223,13 +227,10 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
struct inode *v9fs_alloc_inode(struct super_block *sb)
{
struct v9fs_inode *v9inode;
- v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL);
+
+ v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
-#ifdef CONFIG_9P_FSCACHE
- v9inode->fscache = NULL;
- mutex_init(&v9inode->fscache_lock);
-#endif
v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
mutex_init(&v9inode->v_mutex);
@@ -246,12 +247,20 @@ void v9fs_free_inode(struct inode *inode)
kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
}
+/*
+ * Set parameters for the netfs library
+ */
+static void v9fs_set_netfs_context(struct inode *inode)
+{
+ netfs_i_context_init(inode, &v9fs_req_ops);
+}
+
int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t rdev)
{
int err = 0;
- inode_init_owner(&init_user_ns,inode, NULL, mode);
+ inode_init_owner(&init_user_ns, inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -334,6 +343,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
err = -EINVAL;
goto error;
}
+
+ v9fs_set_netfs_context(inode);
error:
return err;
@@ -376,12 +387,16 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
void v9fs_evict_inode(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
+ __le32 version;
truncate_inode_pages_final(&inode->i_data);
+ version = cpu_to_le32(v9inode->qid.version);
+ fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode,
+ &version);
clear_inode(inode);
filemap_fdatawrite(&inode->i_data);
- v9fs_cache_inode_put_cookie(inode);
+ fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
/* clunk the fid stashed in writeback_fid */
if (v9inode->writeback_fid) {
p9_client_clunk(v9inode->writeback_fid);
@@ -440,7 +455,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
unsigned long i_ino;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *, void *);
+ int (*test)(struct inode *inode, void *data);
if (new)
test = v9fs_test_new_inode;
@@ -499,8 +514,10 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
static int v9fs_at_to_dotl_flags(int flags)
{
int rflags = 0;
+
if (flags & AT_REMOVEDIR)
rflags |= P9_DOTL_AT_REMOVEDIR;
+
return rflags;
}
@@ -797,7 +814,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
static int
v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
- struct file *file, unsigned flags, umode_t mode)
+ struct file *file, unsigned int flags, umode_t mode)
{
int err;
u32 perm;
@@ -862,7 +879,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
file->private_data = fid;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- v9fs_cache_inode_set_cookie(d_inode(dentry), file);
+ fscache_use_cookie(v9fs_inode_cookie(v9inode),
+ file->f_mode & FMODE_WRITE);
v9fs_open_fid_add(inode, fid);
file->f_mode |= FMODE_CREATED;
@@ -1065,6 +1083,8 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *iattr)
{
int retval, use_dentry = 0;
+ struct inode *inode = d_inode(dentry);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL;
struct p9_wstat wstat;
@@ -1084,7 +1104,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
fid = v9fs_fid_lookup(dentry);
use_dentry = 1;
}
- if(IS_ERR(fid))
+ if (IS_ERR(fid))
return PTR_ERR(fid);
v9fs_blank_wstat(&wstat);
@@ -1110,7 +1130,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
/* Write all dirty data */
if (d_is_reg(dentry))
- filemap_write_and_wait(d_inode(dentry)->i_mapping);
+ filemap_write_and_wait(inode->i_mapping);
retval = p9_client_wstat(fid, &wstat);
@@ -1121,13 +1141,15 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
return retval;
if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(d_inode(dentry)))
- truncate_setsize(d_inode(dentry), iattr->ia_size);
+ iattr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, iattr->ia_size);
+ fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size);
+ }
- v9fs_invalidate_inode_attr(d_inode(dentry));
+ v9fs_invalidate_inode_attr(inode);
- setattr_copy(&init_user_ns, d_inode(dentry), iattr);
- mark_inode_dirty(d_inode(dentry));
+ setattr_copy(&init_user_ns, inode, iattr);
+ mark_inode_dirty(inode);
return 0;
}
@@ -1364,7 +1386,7 @@ v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1];
u32 perm;
- p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n",
dir->i_ino, dentry, mode,
MAJOR(rdev), MINOR(rdev));
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 01b9e1281a29..d17502a738a9 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_inode_dotl.c
- *
* This file contains vfs inode ops for the 9P2000.L protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -107,7 +105,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
unsigned long i_ino;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *, void *);
+ int (*test)(struct inode *inode, void *data);
if (new)
test = v9fs_test_new_inode_dotl;
@@ -230,7 +228,7 @@ v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir,
static int
v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
- struct file *file, unsigned flags, umode_t omode)
+ struct file *file, unsigned int flags, umode_t omode)
{
int err = 0;
kgid_t gid;
@@ -261,7 +259,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
v9ses = v9fs_inode2v9ses(dir);
name = dentry->d_name.name;
- p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
+ p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%x\n",
name, flags, omode);
dfid = v9fs_parent_fid(dentry);
@@ -346,7 +344,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
goto err_clunk_old_fid;
file->private_data = ofid;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- v9fs_cache_inode_set_cookie(inode, file);
+ fscache_use_cookie(v9fs_inode_cookie(v9inode),
+ file->f_mode & FMODE_WRITE);
v9fs_open_fid_add(inode, ofid);
file->f_mode |= FMODE_CREATED;
out:
@@ -553,7 +552,10 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
{
int retval, use_dentry = 0;
struct p9_fid *fid = NULL;
- struct p9_iattr_dotl p9attr;
+ struct p9_iattr_dotl p9attr = {
+ .uid = INVALID_UID,
+ .gid = INVALID_GID,
+ };
struct inode *inode = d_inode(dentry);
p9_debug(P9_DEBUG_VFS, "\n");
@@ -563,14 +565,22 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
return retval;
p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
- p9attr.mode = iattr->ia_mode;
- p9attr.uid = iattr->ia_uid;
- p9attr.gid = iattr->ia_gid;
- p9attr.size = iattr->ia_size;
- p9attr.atime_sec = iattr->ia_atime.tv_sec;
- p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
- p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
- p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+ if (iattr->ia_valid & ATTR_MODE)
+ p9attr.mode = iattr->ia_mode;
+ if (iattr->ia_valid & ATTR_UID)
+ p9attr.uid = iattr->ia_uid;
+ if (iattr->ia_valid & ATTR_GID)
+ p9attr.gid = iattr->ia_gid;
+ if (iattr->ia_valid & ATTR_SIZE)
+ p9attr.size = iattr->ia_size;
+ if (iattr->ia_valid & ATTR_ATIME_SET) {
+ p9attr.atime_sec = iattr->ia_atime.tv_sec;
+ p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
+ }
+ if (iattr->ia_valid & ATTR_MTIME_SET) {
+ p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
+ p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+ }
if (iattr->ia_valid & ATTR_FILE) {
fid = iattr->ia_file->private_data;
@@ -807,6 +817,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
/* Get the latest stat info from server. */
struct p9_fid *fid;
+
fid = v9fs_fid_lookup(old_dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -843,7 +854,7 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
struct p9_qid qid;
struct posix_acl *dacl = NULL, *pacl = NULL;
- p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n",
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 5fce6e30bc5a..97e23b4e6982 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -1,9 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_super.c
- *
- * This file contians superblock ops for 9P2000. It is intended that
- * you mount this file system on directories.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
* Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
@@ -24,6 +20,7 @@
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/magic.h>
+#include <linux/fscache.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -83,6 +80,9 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (!v9ses->cache) {
sb->s_bdi->ra_pages = 0;
sb->s_bdi->io_pages = 0;
+ } else {
+ sb->s_bdi->ra_pages = v9ses->maxdata >> PAGE_SHIFT;
+ sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT;
}
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
@@ -113,7 +113,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
struct inode *inode = NULL;
struct dentry *root = NULL;
struct v9fs_session_info *v9ses = NULL;
- umode_t mode = S_IRWXUGO | S_ISVTX;
+ umode_t mode = 0777 | S_ISVTX;
struct p9_fid *fid;
int retval = 0;
@@ -157,6 +157,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
sb->s_root = root;
if (v9fs_proto_dotl(v9ses)) {
struct p9_stat_dotl *st = NULL;
+
st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
if (IS_ERR(st)) {
retval = PTR_ERR(st);
@@ -167,6 +168,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
kfree(st);
} else {
struct p9_wstat *st = NULL;
+
st = p9_client_stat(fid);
if (IS_ERR(st)) {
retval = PTR_ERR(st);
@@ -275,12 +277,13 @@ done:
static int v9fs_drop_inode(struct inode *inode)
{
struct v9fs_session_info *v9ses;
+
v9ses = v9fs_inode2v9ses(inode);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
return generic_drop_inode(inode);
/*
* in case of non cached mode always drop the
- * the inode because we want the inode attribute
+ * inode because we want the inode attribute
* to always match that on the server.
*/
return 1;
@@ -307,6 +310,7 @@ static int v9fs_write_inode(struct inode *inode,
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
+ fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
return 0;
}
@@ -330,6 +334,7 @@ static int v9fs_write_inode_dotl(struct inode *inode,
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
+ fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
return 0;
}
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index ee331845e2c7..a824441b95a2 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#include <linux/module.h>
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index c63c3bea5de5..3e11fc3331eb 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#ifndef FS_9P_XATTR_H
#define FS_9P_XATTR_H
@@ -22,13 +14,14 @@ extern const struct xattr_handler *v9fs_xattr_handlers[];
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
-extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
- void *, size_t);
-extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
- void *, size_t);
-extern int v9fs_fid_xattr_set(struct p9_fid *, const char *,
- const void *, size_t, int);
-extern int v9fs_xattr_set(struct dentry *, const char *,
- const void *, size_t, int);
-extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
+ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
+ void *buffer, size_t buffer_size);
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t buffer_size);
+int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
+ const void *value, size_t value_len, int flags);
+int v9fs_xattr_set(struct dentry *dentry, const char *name,
+ const void *value, size_t value_len, int flags);
+ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer,
+ size_t buffer_size);
#endif /* FS_9P_XATTR_H */
diff --git a/fs/Kconfig b/fs/Kconfig
index a6313a969bc5..30b751c7f11a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -15,11 +15,11 @@ config VALIDATE_FS_PARSER
Enable this to perform validation of the parameter description for a
filesystem when it is registered.
-if BLOCK
-
config FS_IOMAP
bool
+if BLOCK
+
source "fs/ext2/Kconfig"
source "fs/ext4/Kconfig"
source "fs/jbd2/Kconfig"
@@ -42,11 +42,13 @@ source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
source "fs/zonefs/Kconfig"
+endif # BLOCK
+
config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
- select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
+ depends on ZONE_DEVICE || FS_DAX_LIMITED
select FS_IOMAP
select DAX
help
@@ -89,8 +91,6 @@ config FS_DAX_PMD
config FS_DAX_LIMITED
bool
-endif # BLOCK
-
# Posix ACL utility routines
#
# Note: Posix ACLs can be implemented without these helpers. Never use
@@ -344,7 +344,7 @@ config LOCKD
config LOCKD_V4
bool
- depends on NFSD_V3 || NFS_V3
+ depends on NFSD || NFS_V3
depends on FILE_LOCKING
default y
@@ -369,8 +369,8 @@ source "fs/ksmbd/Kconfig"
config SMBFS_COMMON
tristate
- default y if CIFS=y
- default m if CIFS=m
+ default y if CIFS=y || SMB_SERVER=y
+ default m if CIFS=m || SMB_SERVER=m
source "fs/coda/Kconfig"
source "fs/afs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4d5ae61580aa..21c6332fa785 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -28,6 +28,16 @@ config BINFMT_ELF
ld.so (check the file <file:Documentation/Changes> for location and
latest version).
+config BINFMT_ELF_KUNIT_TEST
+ bool "Build KUnit tests for ELF binary support" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y && BINFMT_ELF=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the ELF loader KUnit tests, which try to gather
+ prior bug fixes into a regression test collection. This is really
+ only needed for debugging. Note that with CONFIG_COMPAT=y, the
+ compat_binfmt_elf KUnit test is also created.
+
config COMPAT_BINFMT_ELF
def_bool y
depends on COMPAT && BINFMT_ELF
@@ -36,6 +46,9 @@ config COMPAT_BINFMT_ELF
config ARCH_BINFMT_ELF_STATE
bool
+config ARCH_BINFMT_ELF_EXTRA_PHDRS
+ bool
+
config ARCH_HAVE_ELF_PROT
bool
diff --git a/fs/Makefile b/fs/Makefile
index 84c5e4cdfee5..208a74e0b00e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -6,6 +6,8 @@
# Rewritten to use lists instead of if-statements.
#
+obj-$(CONFIG_SYSCTL) += sysctls.o
+
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
ioctl.o readdir.o select.o dcache.o inode.o \
@@ -94,7 +96,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/
obj-$(CONFIG_NFSD) += nfsd/
obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS) += nls/
-obj-$(CONFIG_UNICODE) += unicode/
+obj-y += unicode/
obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/
obj-$(CONFIG_CIFS) += cifs/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index adbb3a1edcbf..561bc748c04a 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -73,7 +73,8 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations adfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = adfs_readpage,
.writepage = adfs_writepage,
.write_begin = adfs_write_begin,
@@ -355,7 +356,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct super_block *sb = inode->i_sb;
struct object_info obj;
- int ret;
obj.indaddr = ADFS_I(inode)->indaddr;
obj.name_len = 0;
@@ -365,6 +365,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
obj.attr = ADFS_I(inode)->attr;
obj.size = inode->i_size;
- ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
- return ret;
+ return adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
}
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index bdbd26e571ed..e8bfc38239cd 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -220,7 +220,7 @@ static struct kmem_cache *adfs_inode_cachep;
static struct inode *adfs_alloc_inode(struct super_block *sb)
{
struct adfs_inode_info *ei;
- ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, adfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 75ebd2b576ca..b3f81d84ff4c 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -453,7 +453,8 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations affs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = affs_readpage,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
@@ -834,7 +835,8 @@ err_bh:
}
const struct address_space_operations affs_aops_ofs = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = affs_readpage_ofs,
//.writepage = affs_writepage_ofs,
.write_begin = affs_write_begin_ofs,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c6c2a513ec92..4c5f30a83336 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -100,7 +100,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
{
struct affs_inode_info *i;
- i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, affs_inode_cachep, GFP_KERNEL);
if (!i)
return NULL;
@@ -389,7 +389,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ size = bdev_nr_sectors(sb->s_bdev);
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 75c4e4043d1d..e8956b65d7ff 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -3,10 +3,7 @@
# Makefile for Red Hat Linux AFS client.
#
-afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
-
kafs-y := \
- $(afs-cache-y) \
addr_list.o \
callback.o \
cell.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
deleted file mode 100644
index 037af93e3aba..000000000000
--- a/fs/afs/cache.c
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* AFS caching stuff
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/sched.h>
-#include "internal.h"
-
-static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
- const void *buffer,
- uint16_t buflen,
- loff_t object_size);
-
-struct fscache_netfs afs_cache_netfs = {
- .name = "afs",
- .version = 2,
-};
-
-struct fscache_cookie_def afs_cell_cache_index_def = {
- .name = "AFS.cell",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-struct fscache_cookie_def afs_volume_cache_index_def = {
- .name = "AFS.volume",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-struct fscache_cookie_def afs_vnode_cache_index_def = {
- .name = "AFS.vnode",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .check_aux = afs_vnode_cache_check_aux,
-};
-
-/*
- * check that the auxiliary data indicates that the entry is still valid
- */
-static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
- const void *buffer,
- uint16_t buflen,
- loff_t object_size)
-{
- struct afs_vnode *vnode = cookie_netfs_data;
- struct afs_vnode_cache_aux aux;
-
- _enter("{%llx,%x,%llx},%p,%u",
- vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
- buffer, buflen);
-
- memcpy(&aux, buffer, sizeof(aux));
-
- /* check the size of the data is what we're expecting */
- if (buflen != sizeof(aux)) {
- _leave(" = OBSOLETE [len %hx != %zx]", buflen, sizeof(aux));
- return FSCACHE_CHECKAUX_OBSOLETE;
- }
-
- if (vnode->status.data_version != aux.data_version) {
- _leave(" = OBSOLETE [vers %llx != %llx]",
- aux.data_version, vnode->status.data_version);
- return FSCACHE_CHECKAUX_OBSOLETE;
- }
-
- _leave(" = SUCCESS");
- return FSCACHE_CHECKAUX_OKAY;
-}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index d88407fb9bc0..07ad744eef77 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -680,13 +680,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
return ret;
}
-#ifdef CONFIG_AFS_FSCACHE
- cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
- &afs_cell_cache_index_def,
- cell->name, strlen(cell->name),
- NULL, 0,
- cell, 0, true);
-#endif
ret = afs_proc_cell_setup(cell);
if (ret < 0)
return ret;
@@ -723,11 +716,6 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
afs_dynroot_rmdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
-#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(cell->cache, NULL, false);
- cell->cache = NULL;
-#endif
-
_leave("");
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 4579bbda4634..932e61e28e5d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -42,10 +42,11 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags);
static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags);
-static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
+static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length);
-static int afs_dir_set_page_dirty(struct page *page)
+static bool afs_dir_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
BUG(); /* This should never happen. */
}
@@ -73,9 +74,9 @@ const struct inode_operations afs_dir_inode_operations = {
};
const struct address_space_operations afs_dir_aops = {
- .set_page_dirty = afs_dir_set_page_dirty,
+ .dirty_folio = afs_dir_dirty_folio,
.releasepage = afs_dir_releasepage,
- .invalidatepage = afs_dir_invalidatepage,
+ .invalidate_folio = afs_dir_invalidate_folio,
};
const struct dentry_operations afs_fs_dentry_operations = {
@@ -103,13 +104,13 @@ struct afs_lookup_cookie {
};
/*
- * Drop the refs that we're holding on the pages we were reading into. We've
+ * Drop the refs that we're holding on the folios we were reading into. We've
* got refs on the first nr_pages pages.
*/
static void afs_dir_read_cleanup(struct afs_read *req)
{
struct address_space *mapping = req->vnode->vfs_inode.i_mapping;
- struct page *page;
+ struct folio *folio;
pgoff_t last = req->nr_pages - 1;
XA_STATE(xas, &mapping->i_pages, 0);
@@ -118,65 +119,56 @@ static void afs_dir_read_cleanup(struct afs_read *req)
return;
rcu_read_lock();
- xas_for_each(&xas, page, last) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, last) {
+ if (xas_retry(&xas, folio))
continue;
- BUG_ON(xa_is_value(page));
- BUG_ON(PageCompound(page));
- ASSERTCMP(page->mapping, ==, mapping);
+ BUG_ON(xa_is_value(folio));
+ ASSERTCMP(folio_file_mapping(folio), ==, mapping);
- put_page(page);
+ folio_put(folio);
}
rcu_read_unlock();
}
/*
- * check that a directory page is valid
+ * check that a directory folio is valid
*/
-static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
- loff_t i_size)
+static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio,
+ loff_t i_size)
{
- struct afs_xdr_dir_page *dbuf;
- loff_t latter, off;
- int tmp, qty;
+ union afs_xdr_dir_block *block;
+ size_t offset, size;
+ loff_t pos;
- /* Determine how many magic numbers there should be in this page, but
+ /* Determine how many magic numbers there should be in this folio, but
* we must take care because the directory may change size under us.
*/
- off = page_offset(page);
- if (i_size <= off)
+ pos = folio_pos(folio);
+ if (i_size <= pos)
goto checked;
- latter = i_size - off;
- if (latter >= PAGE_SIZE)
- qty = PAGE_SIZE;
- else
- qty = latter;
- qty /= sizeof(union afs_xdr_dir_block);
-
- /* check them */
- dbuf = kmap_atomic(page);
- for (tmp = 0; tmp < qty; tmp++) {
- if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) {
- printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n",
- __func__, dvnode->vfs_inode.i_ino, tmp, qty,
- ntohs(dbuf->blocks[tmp].hdr.magic));
- trace_afs_dir_check_failed(dvnode, off, i_size);
- kunmap(page);
+ size = min_t(loff_t, folio_size(folio), i_size - pos);
+ for (offset = 0; offset < size; offset += sizeof(*block)) {
+ block = kmap_local_folio(folio, offset);
+ if (block->hdr.magic != AFS_DIR_MAGIC) {
+ printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n",
+ __func__, dvnode->vfs_inode.i_ino,
+ pos, offset, size, ntohs(block->hdr.magic));
+ trace_afs_dir_check_failed(dvnode, pos + offset, i_size);
+ kunmap_local(block);
trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic);
goto error;
}
/* Make sure each block is NUL terminated so we can reasonably
- * use string functions on it. The filenames in the page
+ * use string functions on it. The filenames in the folio
* *should* be NUL-terminated anyway.
*/
- ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0;
- }
-
- kunmap_atomic(dbuf);
+ ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0;
+ kunmap_local(block);
+ }
checked:
afs_stat_v(dvnode, n_read_dir);
return true;
@@ -190,11 +182,11 @@ error:
*/
static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
{
- struct afs_xdr_dir_page *dbuf;
+ union afs_xdr_dir_block *block;
struct address_space *mapping = dvnode->vfs_inode.i_mapping;
- struct page *page;
- unsigned int i, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block);
+ struct folio *folio;
pgoff_t last = req->nr_pages - 1;
+ size_t offset, size;
XA_STATE(xas, &mapping->i_pages, 0);
@@ -205,30 +197,28 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
req->pos, req->nr_pages,
req->iter->iov_offset, iov_iter_count(req->iter));
- xas_for_each(&xas, page, last) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, last) {
+ if (xas_retry(&xas, folio))
continue;
- BUG_ON(PageCompound(page));
- BUG_ON(page->mapping != mapping);
-
- dbuf = kmap_atomic(page);
- for (i = 0; i < qty; i++) {
- union afs_xdr_dir_block *block = &dbuf->blocks[i];
+ BUG_ON(folio_file_mapping(folio) != mapping);
- pr_warn("[%02lx] %32phN\n", page->index * qty + i, block);
+ size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio));
+ for (offset = 0; offset < size; offset += sizeof(*block)) {
+ block = kmap_local_folio(folio, offset);
+ pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block);
+ kunmap_local(block);
}
- kunmap_atomic(dbuf);
}
}
/*
- * Check all the pages in a directory. All the pages are held pinned.
+ * Check all the blocks in a directory. All the folios are held pinned.
*/
static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
{
struct address_space *mapping = dvnode->vfs_inode.i_mapping;
- struct page *page;
+ struct folio *folio;
pgoff_t last = req->nr_pages - 1;
int ret = 0;
@@ -238,14 +228,13 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
return 0;
rcu_read_lock();
- xas_for_each(&xas, page, last) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, last) {
+ if (xas_retry(&xas, folio))
continue;
- BUG_ON(PageCompound(page));
- BUG_ON(page->mapping != mapping);
+ BUG_ON(folio_file_mapping(folio) != mapping);
- if (!afs_dir_check_page(dvnode, page, req->file_size)) {
+ if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) {
afs_dir_dump(dvnode, req);
ret = -EIO;
break;
@@ -274,15 +263,16 @@ static int afs_dir_open(struct inode *inode, struct file *file)
/*
* Read the directory into the pagecache in one go, scrubbing the previous
- * contents. The list of pages is returned, pinning them so that they don't
+ * contents. The list of folios is returned, pinning them so that they don't
* get reclaimed during the iteration.
*/
static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
__acquires(&dvnode->validate_lock)
{
+ struct address_space *mapping = dvnode->vfs_inode.i_mapping;
struct afs_read *req;
loff_t i_size;
- int nr_pages, i, n;
+ int nr_pages, i;
int ret;
_enter("");
@@ -320,43 +310,30 @@ expand:
req->iter = &req->def_iter;
/* Fill in any gaps that we might find where the memory reclaimer has
- * been at work and pin all the pages. If there are any gaps, we will
+ * been at work and pin all the folios. If there are any gaps, we will
* need to reread the entire directory contents.
*/
i = req->nr_pages;
while (i < nr_pages) {
- struct page *pages[8], *page;
-
- n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i,
- min_t(unsigned int, nr_pages - i,
- ARRAY_SIZE(pages)),
- pages);
- _debug("find %u at %u/%u", n, i, nr_pages);
-
- if (n == 0) {
- gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask;
+ struct folio *folio;
+ folio = filemap_get_folio(mapping, i);
+ if (!folio) {
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_stat_v(dvnode, n_inval);
ret = -ENOMEM;
- page = __page_cache_alloc(gfp);
- if (!page)
- goto error;
- ret = add_to_page_cache_lru(page,
- dvnode->vfs_inode.i_mapping,
- i, gfp);
- if (ret < 0)
+ folio = __filemap_get_folio(mapping,
+ i, FGP_LOCK | FGP_CREAT,
+ mapping->gfp_mask);
+ if (!folio)
goto error;
-
- attach_page_private(page, (void *)1);
- unlock_page(page);
- req->nr_pages++;
- i++;
- } else {
- req->nr_pages += n;
- i += n;
+ folio_attach_private(folio, (void *)1);
+ folio_unlock(folio);
}
+
+ req->nr_pages += folio_nr_pages(folio);
+ i += folio_nr_pages(folio);
}
/* If we're going to reload, we need to lock all the pages to prevent
@@ -424,7 +401,7 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
size_t nlen;
int tmp;
- _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
+ _enter("%llx,%x", ctx->pos, blkoff);
curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent);
@@ -513,12 +490,10 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
struct key *key, afs_dataversion_t *_dir_version)
{
struct afs_vnode *dvnode = AFS_FS_I(dir);
- struct afs_xdr_dir_page *dbuf;
union afs_xdr_dir_block *dblock;
struct afs_read *req;
- struct page *page;
- unsigned blkoff, limit;
- void __rcu **slot;
+ struct folio *folio;
+ unsigned offset, size;
int ret;
_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
@@ -540,43 +515,30 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
/* walk through the blocks in sequence */
ret = 0;
while (ctx->pos < req->actual_len) {
- blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1);
-
- /* Fetch the appropriate page from the directory and re-add it
+ /* Fetch the appropriate folio from the directory and re-add it
* to the LRU. We have all the pages pinned with an extra ref.
*/
- rcu_read_lock();
- page = NULL;
- slot = radix_tree_lookup_slot(&dvnode->vfs_inode.i_mapping->i_pages,
- blkoff / PAGE_SIZE);
- if (slot)
- page = radix_tree_deref_slot(slot);
- rcu_read_unlock();
- if (!page) {
+ folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE,
+ FGP_ACCESSED, 0);
+ if (!folio) {
ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
break;
}
- mark_page_accessed(page);
-
- limit = blkoff & ~(PAGE_SIZE - 1);
- dbuf = kmap(page);
+ offset = round_down(ctx->pos, sizeof(*dblock)) - folio_file_pos(folio);
+ size = min_t(loff_t, folio_size(folio),
+ req->actual_len - folio_file_pos(folio));
- /* deal with the individual blocks stashed on this page */
do {
- dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
- sizeof(union afs_xdr_dir_block)];
- ret = afs_dir_iterate_block(dvnode, ctx, dblock, blkoff);
- if (ret != 1) {
- kunmap(page);
+ dblock = kmap_local_folio(folio, offset);
+ ret = afs_dir_iterate_block(dvnode, ctx, dblock,
+ folio_file_pos(folio) + offset);
+ kunmap_local(dblock);
+ if (ret != 1)
goto out;
- }
-
- blkoff += sizeof(union afs_xdr_dir_block);
- } while (ctx->pos < dir->i_size && blkoff < limit);
+ } while (offset += sizeof(*dblock), offset < size);
- kunmap(page);
ret = 0;
}
@@ -2037,42 +1999,41 @@ error:
}
/*
- * Release a directory page and clean up its private state if it's not busy
- * - return true if the page can now be released, false if not
+ * Release a directory folio and clean up its private state if it's not busy
+ * - return true if the folio can now be released, false if not
*/
-static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags)
+static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags)
{
- struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host);
+ struct folio *folio = page_folio(subpage);
+ struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
+ _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
- detach_page_private(page);
+ folio_detach_private(folio);
/* The directory will need reloading. */
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_stat_v(dvnode, n_relpg);
- return 1;
+ return true;
}
/*
- * invalidate part or all of a page
- * - release a page and clean up its private data if offset is 0 (indicating
- * the entire page)
+ * Invalidate part or all of a folio.
*/
-static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host);
+ struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{%lu},%u,%u", page->index, offset, length);
+ _enter("{%lu},%zu,%zu", folio->index, offset, length);
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
/* The directory will need reloading. */
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_stat_v(dvnode, n_inval);
- /* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == thp_size(page))
- detach_page_private(page);
+ /* we clean up only if the entire folio is being invalidated */
+ if (offset == 0 && length == folio_size(folio))
+ folio_detach_private(folio);
}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 540b9fc96824..d98e109ecee9 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -105,6 +105,25 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
}
/*
+ * Get a new directory folio.
+ */
+static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
+{
+ struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct folio *folio;
+
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping->gfp_mask);
+ if (!folio)
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ else if (folio && !folio_test_private(folio))
+ folio_attach_private(folio, (void *)1);
+
+ return folio;
+}
+
+/*
* Scan a directory block looking for a dirent of the right name.
*/
static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name,
@@ -188,13 +207,11 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *meta, *block;
- struct afs_xdr_dir_page *meta_page, *dir_page;
union afs_xdr_dirent *de;
- struct page *page0, *page;
+ struct folio *folio0, *folio;
unsigned int need_slots, nr_blocks, b;
pgoff_t index;
loff_t i_size;
- gfp_t gfp;
int slot;
_enter(",,{%d,%s},", name->len, name->name);
@@ -206,10 +223,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
return;
}
- gfp = vnode->vfs_inode.i_mapping->gfp_mask;
- page0 = find_or_create_page(vnode->vfs_inode.i_mapping, 0, gfp);
- if (!page0) {
- clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ folio0 = afs_dir_get_folio(vnode, 0);
+ if (!folio0) {
_leave(" [fgp]");
return;
}
@@ -217,42 +232,35 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
/* Work out how many slots we're going to need. */
need_slots = afs_dir_calc_slots(name->len);
- meta_page = kmap(page0);
- meta = &meta_page->blocks[0];
+ meta = kmap_local_folio(folio0, 0);
if (i_size == 0)
goto new_directory;
nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
- /* Find a block that has sufficient slots available. Each VM page
+ /* Find a block that has sufficient slots available. Each folio
* contains two or more directory blocks.
*/
for (b = 0; b < nr_blocks + 1; b++) {
- /* If the directory extended into a new page, then we need to
- * tack a new page on the end.
+ /* If the directory extended into a new folio, then we need to
+ * tack a new folio on the end.
*/
index = b / AFS_DIR_BLOCKS_PER_PAGE;
- if (index == 0) {
- page = page0;
- dir_page = meta_page;
- } else {
- if (nr_blocks >= AFS_DIR_MAX_BLOCKS)
- goto error;
- gfp = vnode->vfs_inode.i_mapping->gfp_mask;
- page = find_or_create_page(vnode->vfs_inode.i_mapping,
- index, gfp);
- if (!page)
+ if (nr_blocks >= AFS_DIR_MAX_BLOCKS)
+ goto error;
+ if (index >= folio_nr_pages(folio0)) {
+ folio = afs_dir_get_folio(vnode, index);
+ if (!folio)
goto error;
- if (!PagePrivate(page))
- attach_page_private(page, (void *)1);
- dir_page = kmap(page);
+ } else {
+ folio = folio0;
}
+ block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio));
+
/* Abandon the edit if we got a callback break. */
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto invalidated;
- block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE];
-
_debug("block %u: %2u %3u %u",
b,
(b < AFS_DIR_BLOCKS_WITH_CTR) ? meta->meta.alloc_ctrs[b] : 99,
@@ -266,7 +274,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE);
}
- /* Only lower dir pages have a counter in the header. */
+ /* Only lower dir blocks have a counter in the header. */
if (b >= AFS_DIR_BLOCKS_WITH_CTR ||
meta->meta.alloc_ctrs[b] >= need_slots) {
/* We need to try and find one or more consecutive
@@ -279,10 +287,10 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
}
}
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
}
@@ -298,8 +306,8 @@ new_directory:
i_size = AFS_DIR_BLOCK_SIZE;
afs_set_i_size(vnode, i_size);
slot = AFS_DIR_RESV_BLOCKS0;
- page = page0;
- block = meta;
+ folio = folio0;
+ block = kmap_local_folio(folio, 0);
nr_blocks = 1;
b = 0;
@@ -318,10 +326,10 @@ found_space:
/* Adjust the bitmap. */
afs_set_contig_bits(block, slot, need_slots);
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
/* Adjust the allocation counter. */
@@ -333,18 +341,19 @@ found_space:
_debug("Insert %s in %u[%u]", name->name, b, slot);
out_unmap:
- unlock_page(page0);
- kunmap(page0);
- put_page(page0);
+ kunmap_local(meta);
+ folio_unlock(folio0);
+ folio_put(folio0);
_leave("");
return;
invalidated:
trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
- if (page != page0) {
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
goto out_unmap;
@@ -364,10 +373,9 @@ error:
void afs_edit_dir_remove(struct afs_vnode *vnode,
struct qstr *name, enum afs_edit_dir_reason why)
{
- struct afs_xdr_dir_page *meta_page, *dir_page;
union afs_xdr_dir_block *meta, *block;
union afs_xdr_dirent *de;
- struct page *page0, *page;
+ struct folio *folio0, *folio;
unsigned int need_slots, nr_blocks, b;
pgoff_t index;
loff_t i_size;
@@ -384,9 +392,8 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
}
nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
- page0 = find_lock_page(vnode->vfs_inode.i_mapping, 0);
- if (!page0) {
- clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ folio0 = afs_dir_get_folio(vnode, 0);
+ if (!folio0) {
_leave(" [fgp]");
return;
}
@@ -394,30 +401,27 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
/* Work out how many slots we're going to discard. */
need_slots = afs_dir_calc_slots(name->len);
- meta_page = kmap(page0);
- meta = &meta_page->blocks[0];
+ meta = kmap_local_folio(folio0, 0);
- /* Find a page that has sufficient slots available. Each VM page
+ /* Find a block that has sufficient slots available. Each folio
* contains two or more directory blocks.
*/
for (b = 0; b < nr_blocks; b++) {
index = b / AFS_DIR_BLOCKS_PER_PAGE;
- if (index != 0) {
- page = find_lock_page(vnode->vfs_inode.i_mapping, index);
- if (!page)
+ if (index >= folio_nr_pages(folio0)) {
+ folio = afs_dir_get_folio(vnode, index);
+ if (!folio)
goto error;
- dir_page = kmap(page);
} else {
- page = page0;
- dir_page = meta_page;
+ folio = folio0;
}
+ block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio));
+
/* Abandon the edit if we got a callback break. */
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto invalidated;
- block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE];
-
if (b > AFS_DIR_BLOCKS_WITH_CTR ||
meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) {
slot = afs_dir_scan_block(block, name, b);
@@ -425,10 +429,10 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
goto found_dirent;
}
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
}
@@ -449,10 +453,10 @@ found_dirent:
/* Adjust the bitmap. */
afs_clear_contig_bits(block, slot, need_slots);
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
/* Adjust the allocation counter. */
@@ -464,9 +468,9 @@ found_dirent:
_debug("Remove %s from %u[%u]", name->name, b, slot);
out_unmap:
- unlock_page(page0);
- kunmap(page0);
- put_page(page0);
+ kunmap_local(meta);
+ folio_unlock(folio0);
+ folio_put(folio0);
_leave("");
return;
@@ -474,10 +478,10 @@ invalidated:
trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval,
0, 0, 0, 0, name->name);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
goto out_unmap;
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index db832cc931c8..f120bcb8bf73 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,6 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
+ netfs_i_context_init(inode, NULL);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index e6c447ae91f3..26292a110a8f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -14,16 +14,16 @@
#include <linux/gfp.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/netfs.h>
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
-static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
+static int afs_symlink_readpage(struct file *file, struct page *page);
+static void afs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
-static void afs_readahead(struct readahead_control *ractl);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static void afs_vm_open(struct vm_area_struct *area);
static void afs_vm_close(struct vm_area_struct *area);
@@ -49,19 +49,25 @@ const struct inode_operations afs_file_inode_operations = {
.permission = afs_permission,
};
-const struct address_space_operations afs_fs_aops = {
- .readpage = afs_readpage,
- .readahead = afs_readahead,
- .set_page_dirty = afs_set_page_dirty,
- .launder_page = afs_launder_page,
+const struct address_space_operations afs_file_aops = {
+ .readpage = netfs_readpage,
+ .readahead = netfs_readahead,
+ .dirty_folio = afs_dirty_folio,
+ .launder_folio = afs_launder_folio,
.releasepage = afs_releasepage,
- .invalidatepage = afs_invalidatepage,
+ .invalidate_folio = afs_invalidate_folio,
.write_begin = afs_write_begin,
.write_end = afs_write_end,
.writepage = afs_writepage,
.writepages = afs_writepages,
};
+const struct address_space_operations afs_symlink_aops = {
+ .readpage = afs_symlink_readpage,
+ .releasepage = afs_releasepage,
+ .invalidate_folio = afs_invalidate_folio,
+};
+
static const struct vm_operations_struct afs_vm_ops = {
.open = afs_vm_open,
.close = afs_vm_close,
@@ -151,7 +157,9 @@ int afs_open(struct inode *inode, struct file *file)
if (file->f_flags & O_TRUNC)
set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
-
+
+ fscache_use_cookie(afs_vnode_cache(vnode), file->f_mode & FMODE_WRITE);
+
file->private_data = af;
_leave(" = 0");
return 0;
@@ -170,8 +178,10 @@ error:
*/
int afs_release(struct inode *inode, struct file *file)
{
+ struct afs_vnode_cache_aux aux;
struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = file->private_data;
+ loff_t i_size;
int ret = 0;
_enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode);
@@ -182,6 +192,15 @@ int afs_release(struct inode *inode, struct file *file)
file->private_data = NULL;
if (af->wb)
afs_put_wb_key(af->wb);
+
+ if ((file->f_mode & FMODE_WRITE)) {
+ i_size = i_size_read(&vnode->vfs_inode);
+ afs_set_cache_aux(vnode, &aux);
+ fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size);
+ } else {
+ fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL);
+ }
+
key_put(af->key);
kfree(af);
afs_prune_wb_keys(vnode);
@@ -219,7 +238,7 @@ void afs_put_read(struct afs_read *req)
static void afs_fetch_data_notify(struct afs_operation *op)
{
struct afs_read *req = op->fetch.req;
- struct netfs_read_subrequest *subreq = req->subreq;
+ struct netfs_io_subrequest *subreq = req->subreq;
int error = op->error;
if (error == -ECONNABORTED)
@@ -289,7 +308,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
return afs_do_sync_operation(op);
}
-static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
+static void afs_issue_read(struct netfs_io_subrequest *subreq)
{
struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
struct afs_read *fsreq;
@@ -313,49 +332,51 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
afs_put_read(fsreq);
}
-static int afs_symlink_readpage(struct page *page)
+static int afs_symlink_readpage(struct file *file, struct page *page)
{
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
struct afs_read *fsreq;
+ struct folio *folio = page_folio(page);
int ret;
fsreq = afs_alloc_read(GFP_NOFS);
if (!fsreq)
return -ENOMEM;
- fsreq->pos = page->index * PAGE_SIZE;
- fsreq->len = PAGE_SIZE;
+ fsreq->pos = folio_pos(folio);
+ fsreq->len = folio_size(folio);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages,
fsreq->pos, fsreq->len);
ret = afs_fetch_data(fsreq->vnode, fsreq);
- page_endio(page, false, ret);
+ if (ret == 0)
+ SetPageUptodate(page);
+ unlock_page(page);
return ret;
}
-static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file)
+static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
{
rreq->netfs_priv = key_get(afs_file_key(file));
+ return 0;
}
-static bool afs_is_cache_enabled(struct inode *inode)
-{
- struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode));
-
- return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects);
-}
-
-static int afs_begin_cache_operation(struct netfs_read_request *rreq)
+static int afs_begin_cache_operation(struct netfs_io_request *rreq)
{
+#ifdef CONFIG_AFS_FSCACHE
struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
- return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode));
+ return fscache_begin_read_operation(&rreq->cache_resources,
+ afs_vnode_cache(vnode));
+#else
+ return -ENOBUFS;
+#endif
}
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
- struct page *page, void **_fsdata)
+ struct folio *folio, void **_fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
@@ -367,55 +388,47 @@ static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv)
key_put(netfs_priv);
}
-const struct netfs_read_request_ops afs_req_ops = {
- .init_rreq = afs_init_rreq,
- .is_cache_enabled = afs_is_cache_enabled,
+const struct netfs_request_ops afs_req_ops = {
+ .init_request = afs_init_request,
.begin_cache_operation = afs_begin_cache_operation,
.check_write_begin = afs_check_write_begin,
- .issue_op = afs_req_issue_op,
+ .issue_read = afs_issue_read,
.cleanup = afs_priv_cleanup,
};
-static int afs_readpage(struct file *file, struct page *page)
+int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (!file)
- return afs_symlink_readpage(page);
-
- return netfs_readpage(file, page, &afs_req_ops, NULL);
-}
-
-static void afs_readahead(struct readahead_control *ractl)
-{
- netfs_readahead(ractl, &afs_req_ops, NULL);
+ fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode)));
+ return 0;
}
/*
* Adjust the dirty region of the page on truncation or full invalidation,
* getting rid of the markers altogether if the region is entirely invalidated.
*/
-static void afs_invalidate_dirty(struct page *page, unsigned int offset,
- unsigned int length)
+static void afs_invalidate_dirty(struct folio *folio, size_t offset,
+ size_t length)
{
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
unsigned long priv;
unsigned int f, t, end = offset + length;
- priv = page_private(page);
+ priv = (unsigned long)folio_get_private(folio);
/* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == thp_size(page))
+ if (offset == 0 && length == folio_size(folio))
goto full_invalidate;
/* If the page was dirtied by page_mkwrite(), the PTE stays writable
* and we don't get another notification to tell us to expand it
* again.
*/
- if (afs_is_page_dirty_mmapped(priv))
+ if (afs_is_folio_dirty_mmapped(priv))
return;
/* We may need to shorten the dirty region */
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
if (t <= offset || f >= end)
return; /* Doesn't overlap */
@@ -433,17 +446,17 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
if (f == t)
goto undirty;
- priv = afs_page_dirty(page, f, t);
- set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page);
+ priv = afs_folio_dirty(folio, f, t);
+ folio_change_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio);
return;
undirty:
- trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page);
- clear_page_dirty_for_io(page);
+ trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio);
+ folio_clear_dirty_for_io(folio);
full_invalidate:
- trace_afs_page_dirty(vnode, tracepoint_string("inval"), page);
- detach_page_private(page);
+ trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio);
+ folio_detach_private(folio);
}
/*
@@ -451,17 +464,17 @@ full_invalidate:
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
-static void afs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void afs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- _enter("{%lu},%u,%u", page->index, offset, length);
+ _enter("{%lu},%zu,%zu", folio->index, offset, length);
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (PagePrivate(page))
- afs_invalidate_dirty(page, offset, length);
+ if (folio_get_private(folio))
+ afs_invalidate_dirty(folio, offset, length);
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
_leave("");
}
@@ -469,32 +482,34 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
* release a page and clean up its private state if it's not busy
* - return true if the page can now be released, false if not
*/
-static int afs_releasepage(struct page *page, gfp_t gfp_flags)
+static int afs_releasepage(struct page *page, gfp_t gfp)
{
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct folio *folio = page_folio(page);
+ struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
_enter("{{%llx:%llu}[%lu],%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
- gfp_flags);
+ vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
+ gfp);
/* deny if page is being written to the cache and the caller hasn't
* elected to wait */
#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page)) {
- if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS))
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
}
+ fscache_note_page_release(afs_vnode_cache(vnode));
#endif
- if (PagePrivate(page)) {
- trace_afs_page_dirty(vnode, tracepoint_string("rel"), page);
- detach_page_private(page);
+ if (folio_test_private(folio)) {
+ trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio);
+ folio_detach_private(folio);
}
- /* indicate that the page can be released */
+ /* Indicate that the folio can be released */
_leave(" = T");
- return 1;
+ return true;
}
static void afs_add_open_mmap(struct afs_vnode *vnode)
@@ -502,8 +517,9 @@ static void afs_add_open_mmap(struct afs_vnode *vnode)
if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
down_write(&vnode->volume->cell->fs_open_mmaps_lock);
- list_add_tail(&vnode->cb_mmap_link,
- &vnode->volume->cell->fs_open_mmaps);
+ if (list_empty(&vnode->cb_mmap_link))
+ list_add_tail(&vnode->cb_mmap_link,
+ &vnode->volume->cell->fs_open_mmaps);
up_write(&vnode->volume->cell->fs_open_mmaps_lock);
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 8fcffea2daf5..30b066299d39 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -54,6 +54,14 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
}
/*
+ * Set parameters for the netfs library
+ */
+static void afs_set_netfs_context(struct afs_vnode *vnode)
+{
+ netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops);
+}
+
+/*
* Initialise an inode from the vnode status.
*/
static int afs_inode_init_from_status(struct afs_operation *op,
@@ -95,7 +103,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_mode = S_IFREG | (status->mode & S_IALLUGO);
inode->i_op = &afs_file_inode_operations;
inode->i_fop = &afs_file_operations;
- inode->i_mapping->a_ops = &afs_fs_aops;
+ inode->i_mapping->a_ops = &afs_file_aops;
break;
case AFS_FTYPE_DIR:
inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO);
@@ -113,11 +121,11 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_mode = S_IFDIR | 0555;
inode->i_op = &afs_mntpt_inode_operations;
inode->i_fop = &afs_mntpt_file_operations;
- inode->i_mapping->a_ops = &afs_fs_aops;
+ inode->i_mapping->a_ops = &afs_symlink_aops;
} else {
inode->i_mode = S_IFLNK | status->mode;
inode->i_op = &afs_symlink_inode_operations;
- inode->i_mapping->a_ops = &afs_fs_aops;
+ inode->i_mapping->a_ops = &afs_symlink_aops;
}
inode_nohighmem(inode);
break;
@@ -128,6 +136,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
}
afs_set_i_size(vnode, status->size);
+ afs_set_netfs_context(vnode);
vnode->invalid_before = status->data_version;
inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
@@ -237,6 +246,7 @@ static void afs_apply_status(struct afs_operation *op,
* idea of what the size should be that's not the same as
* what's on the server.
*/
+ vnode->netfs_ctx.remote_i_size = status->size;
if (change_size) {
afs_set_i_size(vnode, status->size);
inode->i_ctime = t;
@@ -413,28 +423,31 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
{
#ifdef CONFIG_AFS_FSCACHE
struct {
- u32 vnode_id;
- u32 unique;
- u32 vnode_id_ext[2]; /* Allow for a 96-bit key */
+ __be32 vnode_id;
+ __be32 unique;
+ __be32 vnode_id_ext[2]; /* Allow for a 96-bit key */
} __packed key;
struct afs_vnode_cache_aux aux;
if (vnode->status.type != AFS_FTYPE_FILE) {
- vnode->cache = NULL;
+ vnode->netfs_ctx.cache = NULL;
return;
}
- key.vnode_id = vnode->fid.vnode;
- key.unique = vnode->fid.unique;
- key.vnode_id_ext[0] = vnode->fid.vnode >> 32;
- key.vnode_id_ext[1] = vnode->fid.vnode_hi;
- aux.data_version = vnode->status.data_version;
-
- vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
- &afs_vnode_cache_index_def,
- &key, sizeof(key),
- &aux, sizeof(aux),
- vnode, vnode->status.size, true);
+ key.vnode_id = htonl(vnode->fid.vnode);
+ key.unique = htonl(vnode->fid.unique);
+ key.vnode_id_ext[0] = htonl(vnode->fid.vnode >> 32);
+ key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi);
+ afs_set_cache_aux(vnode, &aux);
+
+ afs_vnode_set_cache(vnode,
+ fscache_acquire_cookie(
+ vnode->volume->cache,
+ vnode->status.type == AFS_FTYPE_FILE ?
+ 0 : FSCACHE_ADV_SINGLE_CHUNK,
+ &key, sizeof(key),
+ &aux, sizeof(aux),
+ vnode->status.size));
#endif
}
@@ -527,6 +540,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
vnode = AFS_FS_I(inode);
vnode->cb_v_break = as->volume->cb_v_break,
+ afs_set_netfs_context(vnode);
op = afs_alloc_operation(key, as->volume);
if (IS_ERR(op)) {
@@ -563,9 +577,7 @@ static void afs_zap_data(struct afs_vnode *vnode)
{
_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
-#ifdef CONFIG_AFS_FSCACHE
- fscache_invalidate(vnode->cache);
-#endif
+ afs_invalidate_cache(vnode, 0);
/* nuke all the non-dirty pages that aren't locked, mapped or being
* written back in a regular file and completely discard the pages in a
@@ -728,10 +740,22 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct afs_vnode *vnode = AFS_FS_I(inode);
- int seq = 0;
+ struct key *key;
+ int ret, seq = 0;
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
+ if (!(query_flags & AT_STATX_DONT_SYNC) &&
+ !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ ret = afs_validate(vnode, key);
+ key_put(key);
+ if (ret < 0)
+ return ret;
+ }
+
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
generic_fillattr(&init_user_ns, inode, stat);
@@ -762,9 +786,8 @@ int afs_drop_inode(struct inode *inode)
*/
void afs_evict_inode(struct inode *inode)
{
- struct afs_vnode *vnode;
-
- vnode = AFS_FS_I(inode);
+ struct afs_vnode_cache_aux aux;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
_enter("{%llx:%llu.%d}",
vnode->fid.vid,
@@ -776,6 +799,9 @@ void afs_evict_inode(struct inode *inode)
ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
truncate_inode_pages_final(&inode->i_data);
+
+ afs_set_cache_aux(vnode, &aux);
+ fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux);
clear_inode(inode);
while (!list_empty(&vnode->wb_keys)) {
@@ -785,16 +811,8 @@ void afs_evict_inode(struct inode *inode)
afs_put_wb_key(wbk);
}
-#ifdef CONFIG_AFS_FSCACHE
- {
- struct afs_vnode_cache_aux aux;
-
- aux.data_version = vnode->status.data_version;
- fscache_relinquish_cookie(vnode->cache, &aux,
- test_bit(AFS_VNODE_DELETED, &vnode->flags));
- vnode->cache = NULL;
- }
-#endif
+ fscache_relinquish_cookie(afs_vnode_cache(vnode),
+ test_bit(AFS_VNODE_DELETED, &vnode->flags));
afs_prune_wb_keys(vnode);
afs_put_permits(rcu_access_pointer(vnode->permit_cache));
@@ -833,6 +851,9 @@ static void afs_setattr_edit_file(struct afs_operation *op)
if (size < i_size)
truncate_pagecache(inode, size);
+ if (size != i_size)
+ fscache_resize_cookie(afs_vnode_cache(vp->vnode),
+ vp->scb.status.size);
}
}
@@ -849,40 +870,67 @@ static const struct afs_operation_ops afs_setattr_operation = {
int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr)
{
+ const unsigned int supported =
+ ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
+ ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH;
struct afs_operation *op;
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+ struct inode *inode = &vnode->vfs_inode;
+ loff_t i_size;
int ret;
_enter("{%llx:%llu},{n=%pd},%x",
vnode->fid.vid, vnode->fid.vnode, dentry,
attr->ia_valid);
- if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
- ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET |
- ATTR_TOUCH))) {
+ if (!(attr->ia_valid & supported)) {
_leave(" = 0 [unsupported]");
return 0;
}
+ i_size = i_size_read(inode);
if (attr->ia_valid & ATTR_SIZE) {
- if (!S_ISREG(vnode->vfs_inode.i_mode))
+ if (!S_ISREG(inode->i_mode))
return -EISDIR;
- ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size);
+ ret = inode_newsize_ok(inode, attr->ia_size);
if (ret)
return ret;
- if (attr->ia_size == i_size_read(&vnode->vfs_inode))
+ if (attr->ia_size == i_size)
attr->ia_valid &= ~ATTR_SIZE;
}
- /* flush any dirty data outstanding on a regular file */
- if (S_ISREG(vnode->vfs_inode.i_mode))
- filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+ fscache_use_cookie(afs_vnode_cache(vnode), true);
/* Prevent any new writebacks from starting whilst we do this. */
down_write(&vnode->validate_lock);
+ if ((attr->ia_valid & ATTR_SIZE) && S_ISREG(inode->i_mode)) {
+ loff_t size = attr->ia_size;
+
+ /* Wait for any outstanding writes to the server to complete */
+ loff_t from = min(size, i_size);
+ loff_t to = max(size, i_size);
+ ret = filemap_fdatawait_range(inode->i_mapping, from, to);
+ if (ret < 0)
+ goto out_unlock;
+
+ /* Don't talk to the server if we're just shortening in-memory
+ * writes that haven't gone to the server yet.
+ */
+ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) &&
+ attr->ia_size < i_size &&
+ attr->ia_size > vnode->status.size) {
+ truncate_pagecache(inode, attr->ia_size);
+ fscache_resize_cookie(afs_vnode_cache(vnode),
+ attr->ia_size);
+ i_size_write(inode, attr->ia_size);
+ ret = 0;
+ goto out_unlock;
+ }
+ }
+
op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ?
afs_file_key(attr->ia_file) : NULL),
vnode->volume);
@@ -907,6 +955,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
out_unlock:
up_write(&vnode->validate_lock);
+ fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 0ad97a8fc0d4..7b7ef945dc78 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -14,7 +14,6 @@
#include <linux/key.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
-#define FSCACHE_USE_NEW_IO_API
#include <linux/fscache.h>
#include <linux/backing-dev.h>
#include <linux/uuid.h>
@@ -208,7 +207,7 @@ struct afs_read {
loff_t file_size; /* File size returned by server */
struct key *key; /* The key to use to reissue the read */
struct afs_vnode *vnode; /* The file being read into. */
- struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */
+ struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */
afs_dataversion_t data_version; /* Version number returned by server */
refcount_t usage;
unsigned int call_debug_id;
@@ -364,9 +363,6 @@ struct afs_cell {
struct key *anonymous_key; /* anonymous user key for this cell */
struct work_struct manager; /* Manager for init/deinit/dns */
struct hlist_node proc_link; /* /proc cell list link */
-#ifdef CONFIG_AFS_FSCACHE
- struct fscache_cookie *cache; /* caching cookie */
-#endif
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
atomic_t ref; /* Struct refcount */
@@ -590,7 +586,7 @@ struct afs_volume {
#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */
#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */
#ifdef CONFIG_AFS_FSCACHE
- struct fscache_cookie *cache; /* caching cookie */
+ struct fscache_volume *cache; /* Caching cookie */
#endif
struct afs_server_list __rcu *servers; /* List of servers on which volume resides */
rwlock_t servers_lock; /* Lock for ->servers */
@@ -623,15 +619,16 @@ enum afs_lock_state {
* leak from one inode to another.
*/
struct afs_vnode {
- struct inode vfs_inode; /* the VFS's inode record */
+ struct {
+ /* These must be contiguous */
+ struct inode vfs_inode; /* the VFS's inode record */
+ struct netfs_i_context netfs_ctx; /* Netfslib context */
+ };
struct afs_volume *volume; /* volume on which vnode resides */
struct afs_fid fid; /* the file identifier for this inode */
struct afs_file_status status; /* AFS status info for this file */
afs_dataversion_t invalid_before; /* Child dentries are invalid before this */
-#ifdef CONFIG_AFS_FSCACHE
- struct fscache_cookie *cache; /* caching cookie */
-#endif
struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */
struct mutex io_lock; /* Lock for serialising I/O on this mutex */
struct rw_semaphore validate_lock; /* lock for validating this vnode */
@@ -678,12 +675,20 @@ struct afs_vnode {
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
{
#ifdef CONFIG_AFS_FSCACHE
- return vnode->cache;
+ return netfs_i_cookie(&vnode->vfs_inode);
#else
return NULL;
#endif
}
+static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
+ struct fscache_cookie *cookie)
+{
+#ifdef CONFIG_AFS_FSCACHE
+ vnode->netfs_ctx.cache = cookie;
+#endif
+}
+
/*
* cached security record for one user's attempt to access a vnode
*/
@@ -872,63 +877,78 @@ struct afs_operation {
* Cache auxiliary data.
*/
struct afs_vnode_cache_aux {
- u64 data_version;
+ __be64 data_version;
} __packed;
+static inline void afs_set_cache_aux(struct afs_vnode *vnode,
+ struct afs_vnode_cache_aux *aux)
+{
+ aux->data_version = cpu_to_be64(vnode->status.data_version);
+}
+
+static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags)
+{
+ struct afs_vnode_cache_aux aux;
+
+ afs_set_cache_aux(vnode, &aux);
+ fscache_invalidate(afs_vnode_cache(vnode), &aux,
+ i_size_read(&vnode->vfs_inode), flags);
+}
+
/*
- * We use page->private to hold the amount of the page that we've written to,
+ * We use folio->private to hold the amount of the folio that we've written to,
* splitting the field into two parts. However, we need to represent a range
- * 0...PAGE_SIZE, so we reduce the resolution if the size of the page
+ * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio
* exceeds what we can encode.
*/
#ifdef CONFIG_64BIT
-#define __AFS_PAGE_PRIV_MASK 0x7fffffffUL
-#define __AFS_PAGE_PRIV_SHIFT 32
-#define __AFS_PAGE_PRIV_MMAPPED 0x80000000UL
+#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL
+#define __AFS_FOLIO_PRIV_SHIFT 32
+#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL
#else
-#define __AFS_PAGE_PRIV_MASK 0x7fffUL
-#define __AFS_PAGE_PRIV_SHIFT 16
-#define __AFS_PAGE_PRIV_MMAPPED 0x8000UL
+#define __AFS_FOLIO_PRIV_MASK 0x7fffUL
+#define __AFS_FOLIO_PRIV_SHIFT 16
+#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL
#endif
-static inline unsigned int afs_page_dirty_resolution(struct page *page)
+static inline unsigned int afs_folio_dirty_resolution(struct folio *folio)
{
- int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
+ int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1);
return (shift > 0) ? shift : 0;
}
-static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv)
+static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv)
{
- unsigned long x = priv & __AFS_PAGE_PRIV_MASK;
+ unsigned long x = priv & __AFS_FOLIO_PRIV_MASK;
/* The lower bound is inclusive */
- return x << afs_page_dirty_resolution(page);
+ return x << afs_folio_dirty_resolution(folio);
}
-static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv)
+static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv)
{
- unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK;
+ unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK;
/* The upper bound is immediately beyond the region */
- return (x + 1) << afs_page_dirty_resolution(page);
+ return (x + 1) << afs_folio_dirty_resolution(folio);
}
-static inline unsigned long afs_page_dirty(struct page *page, size_t from, size_t to)
+static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to)
{
- unsigned int res = afs_page_dirty_resolution(page);
+ unsigned int res = afs_folio_dirty_resolution(folio);
from >>= res;
to = (to - 1) >> res;
- return (to << __AFS_PAGE_PRIV_SHIFT) | from;
+ return (to << __AFS_FOLIO_PRIV_SHIFT) | from;
}
-static inline unsigned long afs_page_dirty_mmapped(unsigned long priv)
+static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv)
{
- return priv | __AFS_PAGE_PRIV_MMAPPED;
+ return priv | __AFS_FOLIO_PRIV_MMAPPED;
}
-static inline bool afs_is_page_dirty_mmapped(unsigned long priv)
+static inline bool afs_is_folio_dirty_mmapped(unsigned long priv)
{
- return priv & __AFS_PAGE_PRIV_MMAPPED;
+ return priv & __AFS_FOLIO_PRIV_MMAPPED;
}
#include <trace/events/afs.h>
@@ -962,13 +982,6 @@ extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
*/
#ifdef CONFIG_AFS_FSCACHE
extern struct fscache_netfs afs_cache_netfs;
-extern struct fscache_cookie_def afs_cell_cache_index_def;
-extern struct fscache_cookie_def afs_volume_cache_index_def;
-extern struct fscache_cookie_def afs_vnode_cache_index_def;
-#else
-#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL)
-#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL)
-#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL)
#endif
/*
@@ -1055,10 +1068,11 @@ extern void afs_dynroot_depopulate(struct super_block *);
/*
* file.c
*/
-extern const struct address_space_operations afs_fs_aops;
+extern const struct address_space_operations afs_file_aops;
+extern const struct address_space_operations afs_symlink_aops;
extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations;
-extern const struct netfs_read_request_ops afs_req_ops;
+extern const struct netfs_request_ops afs_req_ops;
extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
extern void afs_put_wb_key(struct afs_wb_key *);
@@ -1067,6 +1081,7 @@ extern int afs_release(struct inode *, struct file *);
extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
extern struct afs_read *afs_alloc_read(gfp_t);
extern void afs_put_read(struct afs_read *);
+extern int afs_write_inode(struct inode *, struct writeback_control *);
static inline struct afs_read *afs_get_read(struct afs_read *req)
{
@@ -1505,7 +1520,7 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *,
* volume.c
*/
extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
-extern void afs_activate_volume(struct afs_volume *);
+extern int afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace);
extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace);
@@ -1514,7 +1529,11 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
* write.c
*/
-extern int afs_set_page_dirty(struct page *);
+#ifdef CONFIG_AFS_FSCACHE
+bool afs_dirty_folio(struct address_space *, struct folio *);
+#else
+#define afs_dirty_folio filemap_dirty_folio
+#endif
extern int afs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
@@ -1527,7 +1546,7 @@ extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-extern int afs_launder_page(struct page *);
+int afs_launder_folio(struct folio *);
/*
* xattr.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 179004b15566..eae288c8d40a 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -186,13 +186,6 @@ static int __init afs_init(void)
if (!afs_lock_manager)
goto error_lockmgr;
-#ifdef CONFIG_AFS_FSCACHE
- /* we want to be able to cache */
- ret = fscache_register_netfs(&afs_cache_netfs);
- if (ret < 0)
- goto error_cache;
-#endif
-
ret = register_pernet_device(&afs_net_ops);
if (ret < 0)
goto error_net;
@@ -215,10 +208,6 @@ error_proc:
error_fs:
unregister_pernet_device(&afs_net_ops);
error_net:
-#ifdef CONFIG_AFS_FSCACHE
- fscache_unregister_netfs(&afs_cache_netfs);
-error_cache:
-#endif
destroy_workqueue(afs_lock_manager);
error_lockmgr:
destroy_workqueue(afs_async_calls);
@@ -245,9 +234,6 @@ static void __exit afs_exit(void)
proc_remove(afs_proc_symlink);
afs_fs_exit();
unregister_pernet_device(&afs_net_ops);
-#ifdef CONFIG_AFS_FSCACHE
- fscache_unregister_netfs(&afs_cache_netfs);
-#endif
destroy_workqueue(afs_lock_manager);
destroy_workqueue(afs_async_calls);
destroy_workqueue(afs_wq);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 065a28bfa3f1..e1b863449296 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -227,7 +227,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
__acquires(cell->proc_lock)
{
- struct afs_cell *cell = PDE_DATA(file_inode(m->file));
+ struct afs_cell *cell = pde_data(file_inode(m->file));
rcu_read_lock();
return seq_hlist_start_head_rcu(&cell->proc_volumes, *_pos);
@@ -236,7 +236,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v,
loff_t *_pos)
{
- struct afs_cell *cell = PDE_DATA(file_inode(m->file));
+ struct afs_cell *cell = pde_data(file_inode(m->file));
return seq_hlist_next_rcu(v, &cell->proc_volumes, _pos);
}
@@ -322,7 +322,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
{
struct afs_vl_seq_net_private *priv = m->private;
struct afs_vlserver_list *vllist;
- struct afs_cell *cell = PDE_DATA(file_inode(m->file));
+ struct afs_cell *cell = pde_data(file_inode(m->file));
loff_t pos = *_pos;
rcu_read_lock();
diff --git a/fs/afs/super.c b/fs/afs/super.c
index d110def8aa8e..1fea195b0b27 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -55,6 +55,7 @@ int afs_net_id;
static const struct super_operations afs_super_ops = {
.statfs = afs_statfs,
.alloc_inode = afs_alloc_inode,
+ .write_inode = afs_write_inode,
.drop_inode = afs_drop_inode,
.destroy_inode = afs_destroy_inode,
.free_inode = afs_free_inode,
@@ -667,6 +668,7 @@ static void afs_i_init_once(void *_vnode)
INIT_LIST_HEAD(&vnode->pending_locks);
INIT_LIST_HEAD(&vnode->granted_locks);
INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
+ INIT_LIST_HEAD(&vnode->cb_mmap_link);
seqlock_init(&vnode->cb_lock);
}
@@ -677,7 +679,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
{
struct afs_vnode *vnode;
- vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
+ vnode = alloc_inode_sb(sb, afs_inode_cachep, GFP_KERNEL);
if (!vnode)
return NULL;
@@ -686,13 +688,11 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
/* Reset anything that shouldn't leak from one inode to the next. */
memset(&vnode->fid, 0, sizeof(vnode->fid));
memset(&vnode->status, 0, sizeof(vnode->status));
+ afs_vnode_set_cache(vnode, NULL);
vnode->volume = NULL;
vnode->lock_key = NULL;
vnode->permit_cache = NULL;
-#ifdef CONFIG_AFS_FSCACHE
- vnode->cache = NULL;
-#endif
vnode->flags = 1 << AFS_VNODE_UNSET;
vnode->lock_state = AFS_VNODE_LOCK_NONE;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index f84194b791d3..94a3d247924b 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -268,15 +268,30 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
/*
* Activate a volume.
*/
-void afs_activate_volume(struct afs_volume *volume)
+int afs_activate_volume(struct afs_volume *volume)
{
#ifdef CONFIG_AFS_FSCACHE
- volume->cache = fscache_acquire_cookie(volume->cell->cache,
- &afs_volume_cache_index_def,
- &volume->vid, sizeof(volume->vid),
- NULL, 0,
- volume, 0, true);
+ struct fscache_volume *vcookie;
+ char *name;
+
+ name = kasprintf(GFP_KERNEL, "afs,%s,%llx",
+ volume->cell->name, volume->vid);
+ if (!name)
+ return -ENOMEM;
+
+ vcookie = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR(vcookie)) {
+ if (vcookie != ERR_PTR(-EBUSY)) {
+ kfree(name);
+ return PTR_ERR(vcookie);
+ }
+ pr_err("AFS: Cache volume key already in use (%s)\n", name);
+ vcookie = NULL;
+ }
+ volume->cache = vcookie;
+ kfree(name);
#endif
+ return 0;
}
/*
@@ -287,7 +302,7 @@ void afs_deactivate_volume(struct afs_volume *volume)
_enter("%s", volume->name);
#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(volume->cache, NULL,
+ fscache_relinquish_volume(volume->cache, NULL,
test_bit(AFS_VOLUME_DELETED, &volume->flags));
volume->cache = NULL;
#endif
diff --git a/fs/afs/write.c b/fs/afs/write.c
index f24370f5c774..4763132ca57e 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -12,17 +12,31 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/netfs.h>
-#include <linux/fscache.h>
#include "internal.h"
+static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len,
+ loff_t i_size, bool caching);
+
+#ifdef CONFIG_AFS_FSCACHE
/*
- * mark a page as having been made dirty and thus needing writeback
+ * Mark a page as having been made dirty and thus needing writeback. We also
+ * need to pin the cache object to write back to.
*/
-int afs_set_page_dirty(struct page *page)
+bool afs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ return fscache_dirty_folio(mapping, folio,
+ afs_vnode_cache(AFS_FS_I(mapping->host)));
+}
+static void afs_folio_start_fscache(bool caching, struct folio *folio)
+{
+ if (caching)
+ folio_start_fscache(folio);
+}
+#else
+static void afs_folio_start_fscache(bool caching, struct folio *folio)
{
- _enter("");
- return __set_page_dirty_nobuffers(page);
}
+#endif
/*
* prepare to perform part of a write to a page
@@ -32,7 +46,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
struct page **_page, void **fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- struct page *page;
+ struct folio *folio;
unsigned long priv;
unsigned f, from;
unsigned t, to;
@@ -46,12 +60,11 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- ret = netfs_write_begin(file, mapping, pos, len, flags, &page, fsdata,
- &afs_req_ops, NULL);
+ ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata);
if (ret < 0)
return ret;
- index = page->index;
+ index = folio_index(folio);
from = pos - index * PAGE_SIZE;
to = from + len;
@@ -59,14 +72,14 @@ try_again:
/* See if this page is already partially written in a way that we can
* merge the new write with.
*/
- if (PagePrivate(page)) {
- priv = page_private(page);
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ if (folio_test_private(folio)) {
+ priv = (unsigned long)folio_get_private(folio);
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
ASSERTCMP(f, <=, t);
- if (PageWriteback(page)) {
- trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), page);
+ if (folio_test_writeback(folio)) {
+ trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio);
goto flush_conflicting_write;
}
/* If the file is being filled locally, allow inter-write
@@ -78,7 +91,7 @@ try_again:
goto flush_conflicting_write;
}
- *_page = page;
+ *_page = &folio->page;
_leave(" = 0");
return 0;
@@ -87,17 +100,17 @@ try_again:
*/
flush_conflicting_write:
_debug("flush conflict");
- ret = write_one_page(page);
+ ret = folio_write_one(folio);
if (ret < 0)
goto error;
- ret = lock_page_killable(page);
+ ret = folio_lock_killable(folio);
if (ret < 0)
goto error;
goto try_again;
error:
- put_page(page);
+ folio_put(folio);
_leave(" = %d", ret);
return ret;
}
@@ -107,63 +120,65 @@ error:
*/
int afs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct page *subpage, void *fsdata)
{
+ struct folio *folio = page_folio(subpage);
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
unsigned long priv;
- unsigned int f, from = pos & (thp_size(page) - 1);
+ unsigned int f, from = offset_in_folio(folio, pos);
unsigned int t, to = from + copied;
- loff_t i_size, maybe_i_size;
+ loff_t i_size, write_end_pos;
_enter("{%llx:%llu},{%lx}",
- vnode->fid.vid, vnode->fid.vnode, page->index);
+ vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (copied < len) {
copied = 0;
goto out;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
if (copied == 0)
goto out;
- maybe_i_size = pos + copied;
+ write_end_pos = pos + copied;
i_size = i_size_read(&vnode->vfs_inode);
- if (maybe_i_size > i_size) {
+ if (write_end_pos > i_size) {
write_seqlock(&vnode->cb_lock);
i_size = i_size_read(&vnode->vfs_inode);
- if (maybe_i_size > i_size)
- afs_set_i_size(vnode, maybe_i_size);
+ if (write_end_pos > i_size)
+ afs_set_i_size(vnode, write_end_pos);
write_sequnlock(&vnode->cb_lock);
+ fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos);
}
- if (PagePrivate(page)) {
- priv = page_private(page);
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ if (folio_test_private(folio)) {
+ priv = (unsigned long)folio_get_private(folio);
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
if (from < f)
f = from;
if (to > t)
t = to;
- priv = afs_page_dirty(page, f, t);
- set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), page);
+ priv = afs_folio_dirty(folio, f, t);
+ folio_change_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio);
} else {
- priv = afs_page_dirty(page, from, to);
- attach_page_private(page, (void *)priv);
- trace_afs_page_dirty(vnode, tracepoint_string("dirty"), page);
+ priv = afs_folio_dirty(folio, from, to);
+ folio_attach_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio);
}
- if (set_page_dirty(page))
- _debug("dirtied %lx", page->index);
+ if (folio_mark_dirty(folio))
+ _debug("dirtied %lx", folio_index(folio));
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
@@ -174,40 +189,32 @@ static void afs_kill_pages(struct address_space *mapping,
loff_t start, loff_t len)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct pagevec pv;
- unsigned int loop, psize;
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
_enter("{%llx:%llu},%llx @%llx",
vnode->fid.vid, vnode->fid.vnode, len, start);
- pagevec_init(&pv);
-
do {
- _debug("kill %llx @%llx", len, start);
+ _debug("kill %lx (to %lx)", index, last);
- pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE,
- PAGEVEC_SIZE, pv.pages);
- if (pv.nr == 0)
- break;
-
- for (loop = 0; loop < pv.nr; loop++) {
- struct page *page = pv.pages[loop];
+ folio = filemap_get_folio(mapping, index);
+ if (!folio) {
+ next = index + 1;
+ continue;
+ }
- if (page->index * PAGE_SIZE >= start + len)
- break;
+ next = folio_next_index(folio);
- psize = thp_size(page);
- start += psize;
- len -= psize;
- ClearPageUptodate(page);
- end_page_writeback(page);
- lock_page(page);
- generic_error_remove_page(mapping, page);
- unlock_page(page);
- }
+ folio_clear_uptodate(folio);
+ folio_end_writeback(folio);
+ folio_lock(folio);
+ generic_error_remove_page(mapping, &folio->page);
+ folio_unlock(folio);
+ folio_put(folio);
- __pagevec_release(&pv);
- } while (len > 0);
+ } while (index = next, index <= last);
_leave("");
}
@@ -220,37 +227,27 @@ static void afs_redirty_pages(struct writeback_control *wbc,
loff_t start, loff_t len)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct pagevec pv;
- unsigned int loop, psize;
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
_enter("{%llx:%llu},%llx @%llx",
vnode->fid.vid, vnode->fid.vnode, len, start);
- pagevec_init(&pv);
-
do {
_debug("redirty %llx @%llx", len, start);
- pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE,
- PAGEVEC_SIZE, pv.pages);
- if (pv.nr == 0)
- break;
-
- for (loop = 0; loop < pv.nr; loop++) {
- struct page *page = pv.pages[loop];
-
- if (page->index * PAGE_SIZE >= start + len)
- break;
-
- psize = thp_size(page);
- start += psize;
- len -= psize;
- redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
+ folio = filemap_get_folio(mapping, index);
+ if (!folio) {
+ next = index + 1;
+ continue;
}
- __pagevec_release(&pv);
- } while (len > 0);
+ next = index + folio_nr_pages(folio);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
+ folio_put(folio);
+ } while (index = next, index <= last);
_leave("");
}
@@ -261,7 +258,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
struct address_space *mapping = vnode->vfs_inode.i_mapping;
- struct page *page;
+ struct folio *folio;
pgoff_t end;
XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
@@ -272,15 +269,16 @@ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsign
rcu_read_lock();
end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, page, end) {
- if (!PageWriteback(page)) {
- kdebug("bad %x @%llx page %lx %lx", len, start, page->index, end);
- ASSERT(PageWriteback(page));
+ xas_for_each(&xas, folio, end) {
+ if (!folio_test_writeback(folio)) {
+ kdebug("bad %x @%llx page %lx %lx",
+ len, start, folio_index(folio), end);
+ ASSERT(folio_test_writeback(folio));
}
- trace_afs_page_dirty(vnode, tracepoint_string("clear"), page);
- detach_page_private(page);
- page_endio(page, true, 0);
+ trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio);
+ folio_detach_private(folio);
+ folio_end_writeback(folio);
}
rcu_read_unlock();
@@ -356,9 +354,10 @@ static const struct afs_operation_ops afs_store_data_operation = {
static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,
bool laundering)
{
+ struct netfs_i_context *ictx = &vnode->netfs_ctx;
struct afs_operation *op;
struct afs_wb_key *wbk = NULL;
- loff_t size = iov_iter_count(iter), i_size;
+ loff_t size = iov_iter_count(iter);
int ret = -ENOKEY;
_enter("%s{%llx:%llu.%u},%llx,%llx",
@@ -380,15 +379,13 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
return -ENOMEM;
}
- i_size = i_size_read(&vnode->vfs_inode);
-
afs_op_set_vnode(op, 0, vnode);
op->file[0].dv_delta = 1;
op->file[0].modification = true;
op->store.write_iter = iter;
op->store.pos = pos;
op->store.size = size;
- op->store.i_size = max(pos + size, i_size);
+ op->store.i_size = max(pos + size, ictx->remote_i_size);
op->store.laundering = laundering;
op->mtime = vnode->vfs_inode.i_mtime;
op->flags |= AFS_OPERATION_UNINTR;
@@ -434,10 +431,11 @@ static void afs_extend_writeback(struct address_space *mapping,
loff_t start,
loff_t max_len,
bool new_content,
+ bool caching,
unsigned int *_len)
{
struct pagevec pvec;
- struct page *page;
+ struct folio *folio;
unsigned long priv;
unsigned int psize, filler = 0;
unsigned int f, t;
@@ -456,43 +454,45 @@ static void afs_extend_writeback(struct address_space *mapping,
*/
rcu_read_lock();
- xas_for_each(&xas, page, ULONG_MAX) {
+ xas_for_each(&xas, folio, ULONG_MAX) {
stop = true;
- if (xas_retry(&xas, page))
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
break;
- if (page->index != index)
+ if (folio_index(folio) != index)
break;
- if (!page_cache_get_speculative(page)) {
+ if (!folio_try_get_rcu(folio)) {
xas_reset(&xas);
continue;
}
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas))) {
- put_page(page);
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
break;
}
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
break;
}
- if (!PageDirty(page) || PageWriteback(page)) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_dirty(folio) ||
+ folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
- psize = thp_size(page);
- priv = page_private(page);
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ psize = folio_size(folio);
+ priv = (unsigned long)folio_get_private(folio);
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
if (f != 0 && !new_content) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
@@ -503,8 +503,8 @@ static void afs_extend_writeback(struct address_space *mapping,
else if (t == psize || new_content)
stop = false;
- index += thp_nr_pages(page);
- if (!pagevec_add(&pvec, page))
+ index += folio_nr_pages(folio);
+ if (!pagevec_add(&pvec, &folio->page))
break;
if (stop)
break;
@@ -521,16 +521,17 @@ static void afs_extend_writeback(struct address_space *mapping,
break;
for (i = 0; i < pagevec_count(&pvec); i++) {
- page = pvec.pages[i];
- trace_afs_page_dirty(vnode, tracepoint_string("store+"), page);
+ folio = page_folio(pvec.pages[i]);
+ trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio);
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
BUG();
- if (test_set_page_writeback(page))
+ if (folio_start_writeback(folio))
BUG();
+ afs_folio_start_fscache(caching, folio);
- *_count -= thp_nr_pages(page);
- unlock_page(page);
+ *_count -= folio_nr_pages(folio);
+ folio_unlock(folio);
}
pagevec_release(&pvec);
@@ -544,10 +545,10 @@ static void afs_extend_writeback(struct address_space *mapping,
* Synchronously write back the locked page and any subsequent non-locked dirty
* pages.
*/
-static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
- struct writeback_control *wbc,
- struct page *page,
- loff_t start, loff_t end)
+static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct folio *folio,
+ loff_t start, loff_t end)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct iov_iter iter;
@@ -555,25 +556,27 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
unsigned int offset, to, len, max_len;
loff_t i_size = i_size_read(&vnode->vfs_inode);
bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
+ bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));
long count = wbc->nr_to_write;
int ret;
- _enter(",%lx,%llx-%llx", page->index, start, end);
+ _enter(",%lx,%llx-%llx", folio_index(folio), start, end);
- if (test_set_page_writeback(page))
+ if (folio_start_writeback(folio))
BUG();
+ afs_folio_start_fscache(caching, folio);
- count -= thp_nr_pages(page);
+ count -= folio_nr_pages(folio);
/* Find all consecutive lockable dirty pages that have contiguous
* written regions, stopping when we find a page that is not
* immediately lockable, is not dirty or is missing, or we reach the
* end of the range.
*/
- priv = page_private(page);
- offset = afs_page_dirty_from(page, priv);
- to = afs_page_dirty_to(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("store"), page);
+ priv = (unsigned long)folio_get_private(folio);
+ offset = afs_folio_dirty_from(folio, priv);
+ to = afs_folio_dirty_to(folio, priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio);
len = to - offset;
start += offset;
@@ -586,9 +589,10 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
max_len = min_t(unsigned long long, max_len, i_size - start);
if (len < max_len &&
- (to == thp_size(page) || new_content))
+ (to == folio_size(folio) || new_content))
afs_extend_writeback(mapping, vnode, &count,
- start, max_len, new_content, &len);
+ start, max_len, new_content,
+ caching, &len);
len = min_t(loff_t, len, max_len);
}
@@ -596,17 +600,23 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
* set; the first page is still locked at this point, but all the rest
* have been unlocked.
*/
- unlock_page(page);
+ folio_unlock(folio);
if (start < i_size) {
_debug("write back %x @%llx [%llx]", len, start, i_size);
+ /* Speculatively write to the cache. We have to fix this up
+ * later if the store fails.
+ */
+ afs_write_to_cache(vnode, start, len, i_size, caching);
+
iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
ret = afs_store_data(vnode, &iter, start, false);
} else {
_debug("write discard %x @%llx [%llx]", len, start, i_size);
/* The dirty region was entirely beyond the EOF. */
+ fscache_clear_page_bits(mapping, start, len, caching);
afs_pages_written_back(vnode, start, len);
ret = 0;
}
@@ -657,16 +667,21 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
* write a page back to the server
* - the caller locked the page for us
*/
-int afs_writepage(struct page *page, struct writeback_control *wbc)
+int afs_writepage(struct page *subpage, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(subpage);
ssize_t ret;
loff_t start;
- _enter("{%lx},", page->index);
+ _enter("{%lx},", folio_index(folio));
+
+#ifdef CONFIG_AFS_FSCACHE
+ folio_wait_fscache(folio);
+#endif
- start = page->index * PAGE_SIZE;
- ret = afs_write_back_from_locked_page(page->mapping, wbc, page,
- start, LLONG_MAX - start);
+ start = folio_index(folio) * PAGE_SIZE;
+ ret = afs_write_back_from_locked_folio(folio_mapping(folio), wbc,
+ folio, start, LLONG_MAX - start);
if (ret < 0) {
_leave(" = %zd", ret);
return ret;
@@ -683,9 +698,10 @@ static int afs_writepages_region(struct address_space *mapping,
struct writeback_control *wbc,
loff_t start, loff_t end, loff_t *_next)
{
- struct page *page;
+ struct folio *folio;
+ struct page *head_page;
ssize_t ret;
- int n;
+ int n, skips = 0;
_enter("%llx,%llx,", start, end);
@@ -693,13 +709,14 @@ static int afs_writepages_region(struct address_space *mapping,
pgoff_t index = start / PAGE_SIZE;
n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE,
- PAGECACHE_TAG_DIRTY, 1, &page);
+ PAGECACHE_TAG_DIRTY, 1, &head_page);
if (!n)
break;
- start = (loff_t)page->index * PAGE_SIZE; /* May regress with THPs */
+ folio = page_folio(head_page);
+ start = folio_pos(folio); /* May regress with THPs */
- _debug("wback %lx", page->index);
+ _debug("wback %lx", folio_index(folio));
/* At this point we hold neither the i_pages lock nor the
* page lock: the page may be truncated or invalidated
@@ -707,37 +724,50 @@ static int afs_writepages_region(struct address_space *mapping,
* back from swapper_space to tmpfs file mapping
*/
if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = lock_page_killable(page);
+ ret = folio_lock_killable(folio);
if (ret < 0) {
- put_page(page);
+ folio_put(folio);
return ret;
}
} else {
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return 0;
}
}
- if (page->mapping != mapping || !PageDirty(page)) {
- start += thp_size(page);
- unlock_page(page);
- put_page(page);
+ if (folio_mapping(folio) != mapping ||
+ !folio_test_dirty(folio)) {
+ start += folio_size(folio);
+ folio_unlock(folio);
+ folio_put(folio);
continue;
}
- if (PageWriteback(page)) {
- unlock_page(page);
- if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
- put_page(page);
+ if (folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ folio_wait_writeback(folio);
+#ifdef CONFIG_AFS_FSCACHE
+ folio_wait_fscache(folio);
+#endif
+ } else {
+ start += folio_size(folio);
+ }
+ folio_put(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched())
+ break;
+ skips++;
+ }
continue;
}
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
BUG();
- ret = afs_write_back_from_locked_page(mapping, wbc, page, start, end);
- put_page(page);
+ ret = afs_write_back_from_locked_folio(mapping, wbc, folio, start, end);
+ folio_put(folio);
if (ret < 0) {
_leave(" = %zd", ret);
return ret;
@@ -861,7 +891,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = thp_head(vmf->page);
+ struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
@@ -869,7 +899,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
unsigned long priv;
vm_fault_t ret = VM_FAULT_RETRY;
- _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index);
+ _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
afs_validate(vnode, af->key);
@@ -879,34 +909,34 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
* be modified. We then assume the entire page will need writing back.
*/
#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page) &&
- wait_on_page_fscache_killable(page) < 0)
+ if (folio_test_fscache(folio) &&
+ folio_wait_fscache_killable(folio) < 0)
goto out;
#endif
- if (wait_on_page_writeback_killable(page))
+ if (folio_wait_writeback_killable(folio))
goto out;
- if (lock_page_killable(page) < 0)
+ if (folio_lock_killable(folio) < 0)
goto out;
- /* We mustn't change page->private until writeback is complete as that
+ /* We mustn't change folio->private until writeback is complete as that
* details the portion of the page we need to write back and we might
* need to redirty the page if there's a problem.
*/
- if (wait_on_page_writeback_killable(page) < 0) {
- unlock_page(page);
+ if (folio_wait_writeback_killable(folio) < 0) {
+ folio_unlock(folio);
goto out;
}
- priv = afs_page_dirty(page, 0, thp_size(page));
- priv = afs_page_dirty_mmapped(priv);
- if (PagePrivate(page)) {
- set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("mkwrite+"), page);
+ priv = afs_folio_dirty(folio, 0, folio_size(folio));
+ priv = afs_folio_dirty_mmapped(priv);
+ if (folio_test_private(folio)) {
+ folio_change_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio);
} else {
- attach_page_private(page, (void *)priv);
- trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), page);
+ folio_attach_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio);
}
file_update_time(file);
@@ -947,38 +977,62 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
/*
* Clean up a page during invalidation.
*/
-int afs_launder_page(struct page *page)
+int afs_launder_folio(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+ struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
struct iov_iter iter;
struct bio_vec bv[1];
unsigned long priv;
unsigned int f, t;
int ret = 0;
- _enter("{%lx}", page->index);
+ _enter("{%lx}", folio->index);
- priv = page_private(page);
- if (clear_page_dirty_for_io(page)) {
+ priv = (unsigned long)folio_get_private(folio);
+ if (folio_clear_dirty_for_io(folio)) {
f = 0;
- t = thp_size(page);
- if (PagePrivate(page)) {
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ t = folio_size(folio);
+ if (folio_test_private(folio)) {
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
}
- bv[0].bv_page = page;
+ bv[0].bv_page = &folio->page;
bv[0].bv_offset = f;
bv[0].bv_len = t - f;
iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
- trace_afs_page_dirty(vnode, tracepoint_string("launder"), page);
- ret = afs_store_data(vnode, &iter, page_offset(page) + f, true);
+ trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio);
+ ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true);
}
- trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page);
- detach_page_private(page);
- wait_on_page_fscache(page);
+ trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio);
+ folio_detach_private(folio);
+ folio_wait_fscache(folio);
return ret;
}
+
+/*
+ * Deal with the completion of writing the data to the cache.
+ */
+static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct afs_vnode *vnode = priv;
+
+ if (IS_ERR_VALUE(transferred_or_error) &&
+ transferred_or_error != -ENOBUFS)
+ afs_invalidate_cache(vnode, 0);
+}
+
+/*
+ * Save the write to the cache also.
+ */
+static void afs_write_to_cache(struct afs_vnode *vnode,
+ loff_t start, size_t len, loff_t i_size,
+ bool caching)
+{
+ fscache_write_to_cache(afs_vnode_cache(vnode),
+ vnode->vfs_inode.i_mapping, start, len, i_size,
+ afs_write_to_cache_done, vnode, caching);
+}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 2b35cba8ad62..fdc7d675b4b0 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -83,25 +83,18 @@ static s64 linux_to_yfs_time(const struct timespec64 *t)
return (u64)t->tv_sec * 10000000 + t->tv_nsec/100;
}
-static __be32 *xdr_encode_YFSStoreStatus_mode(__be32 *bp, mode_t mode)
-{
- struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
-
- x->mask = htonl(AFS_SET_MODE);
- x->mode = htonl(mode & S_IALLUGO);
- x->mtime_client = u64_to_xdr(0);
- x->owner = u64_to_xdr(0);
- x->group = u64_to_xdr(0);
- return bp + xdr_size(x);
-}
-
-static __be32 *xdr_encode_YFSStoreStatus_mtime(__be32 *bp, const struct timespec64 *t)
+static __be32 *xdr_encode_YFSStoreStatus(__be32 *bp, mode_t *mode,
+ const struct timespec64 *t)
{
struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
+ mode_t masked_mode = mode ? *mode & S_IALLUGO : 0;
s64 mtime = linux_to_yfs_time(t);
+ u32 mask = AFS_SET_MTIME;
- x->mask = htonl(AFS_SET_MTIME);
- x->mode = htonl(0);
+ mask |= mode ? AFS_SET_MODE : 0;
+
+ x->mask = htonl(mask);
+ x->mode = htonl(masked_mode);
x->mtime_client = u64_to_xdr(mtime);
x->owner = u64_to_xdr(0);
x->group = u64_to_xdr(0);
@@ -576,7 +569,7 @@ void yfs_fs_create_file(struct afs_operation *op)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
- bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode);
+ bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);
bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
yfs_check_req(call, bp);
@@ -625,7 +618,7 @@ void yfs_fs_make_dir(struct afs_operation *op)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
- bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode);
+ bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);
yfs_check_req(call, bp);
trace_afs_make_fs_call1(call, &dvp->fid, name);
@@ -946,6 +939,7 @@ void yfs_fs_symlink(struct afs_operation *op)
struct afs_vnode_param *dvp = &op->file[0];
struct afs_call *call;
size_t contents_sz;
+ mode_t mode = 0777;
__be32 *bp;
_enter("");
@@ -972,7 +966,7 @@ void yfs_fs_symlink(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
bp = xdr_encode_string(bp, op->create.symlink, contents_sz);
- bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO);
+ bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime);
yfs_check_req(call, bp);
trace_afs_make_fs_call1(call, &dvp->fid, name);
@@ -1103,7 +1097,7 @@ void yfs_fs_store_data(struct afs_operation *op)
bp = xdr_encode_u32(bp, YFSSTOREDATA64);
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &vp->fid);
- bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime);
+ bp = xdr_encode_YFSStoreStatus(bp, NULL, &op->mtime);
bp = xdr_encode_u64(bp, op->store.pos);
bp = xdr_encode_u64(bp, op->store.size);
bp = xdr_encode_u64(bp, op->store.i_size);
diff --git a/fs/aio.c b/fs/aio.c
index 51b08ab01dff..3c249b938632 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -181,8 +181,9 @@ struct poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool done;
bool cancelled;
+ bool work_scheduled;
+ bool work_need_resched;
struct wait_queue_entry wait;
struct work_struct work;
};
@@ -219,9 +220,35 @@ struct aio_kiocb {
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr; /* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
+static unsigned long aio_nr; /* current system wide number of aio requests */
+static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
+#ifdef CONFIG_SYSCTL
+static struct ctl_table aio_sysctls[] = {
+ {
+ .procname = "aio-nr",
+ .data = &aio_nr,
+ .maxlen = sizeof(aio_nr),
+ .mode = 0444,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "aio-max-nr",
+ .data = &aio_max_nr,
+ .maxlen = sizeof(aio_max_nr),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {}
+};
+
+static void __init aio_sysctl_init(void)
+{
+ register_sysctl_init("fs", aio_sysctls);
+}
+#else
+#define aio_sysctl_init() do { } while (0)
+#endif
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
@@ -274,6 +301,7 @@ static int __init aio_setup(void)
kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ aio_sysctl_init();
return 0;
}
__initcall(aio_setup);
@@ -450,7 +478,7 @@ out:
#endif
static const struct address_space_operations aio_ctx_aops = {
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
#if IS_ENABLED(CONFIG_MIGRATION)
.migratepage = aio_migratepage,
#endif
@@ -659,8 +687,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
new_nr = (table ? table->nr : 1) * 4;
spin_unlock(&mm->ioctx_lock);
- table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
- new_nr, GFP_KERNEL);
+ table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -1417,7 +1444,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb)
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
-static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void aio_complete_rw(struct kiocb *kiocb, long res)
{
struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
@@ -1437,7 +1464,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
}
iocb->ki_res.res = res;
- iocb->ki_res.res2 = res2;
+ iocb->ki_res.res2 = 0;
iocb_put(iocb);
}
@@ -1451,7 +1478,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_flags = iocb_flags(req->ki_filp);
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
- req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
/*
* If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
@@ -1508,7 +1534,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
ret = -EINTR;
fallthrough;
default:
- req->ki_complete(req, ret, 0);
+ req->ki_complete(req, ret);
}
}
@@ -1526,7 +1552,6 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
file = req->ki_filp;
if (unlikely(!(file->f_mode & FMODE_READ)))
return -EBADF;
- ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
return -EINVAL;
@@ -1620,6 +1645,51 @@ static void aio_poll_put_work(struct work_struct *work)
iocb_put(iocb);
}
+/*
+ * Safely lock the waitqueue which the request is on, synchronizing with the
+ * case where the ->poll() provider decides to free its waitqueue early.
+ *
+ * Returns true on success, meaning that req->head->lock was locked, req->wait
+ * is on req->head, and an RCU read lock was taken. Returns false if the
+ * request was already removed from its waitqueue (which might no longer exist).
+ */
+static bool poll_iocb_lock_wq(struct poll_iocb *req)
+{
+ wait_queue_head_t *head;
+
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us, then check whether the request is still on the queue.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ head = smp_load_acquire(&req->head);
+ if (head) {
+ spin_lock(&head->lock);
+ if (!list_empty(&req->wait.entry))
+ return true;
+ spin_unlock(&head->lock);
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+static void poll_iocb_unlock_wq(struct poll_iocb *req)
+{
+ spin_unlock(&req->head->lock);
+ rcu_read_unlock();
+}
+
static void aio_poll_complete_work(struct work_struct *work)
{
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1639,14 +1709,27 @@ static void aio_poll_complete_work(struct work_struct *work)
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->ctx_lock);
- if (!mask && !READ_ONCE(req->cancelled)) {
- add_wait_queue(req->head, &req->wait);
- spin_unlock_irq(&ctx->ctx_lock);
- return;
- }
+ if (poll_iocb_lock_wq(req)) {
+ if (!mask && !READ_ONCE(req->cancelled)) {
+ /*
+ * The request isn't actually ready to be completed yet.
+ * Reschedule completion if another wakeup came in.
+ */
+ if (req->work_need_resched) {
+ schedule_work(&req->work);
+ req->work_need_resched = false;
+ } else {
+ req->work_scheduled = false;
+ }
+ poll_iocb_unlock_wq(req);
+ spin_unlock_irq(&ctx->ctx_lock);
+ return;
+ }
+ list_del_init(&req->wait.entry);
+ poll_iocb_unlock_wq(req);
+ } /* else, POLLFREE has freed the waitqueue, so we must complete */
list_del_init(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
spin_unlock_irq(&ctx->ctx_lock);
iocb_put(iocb);
@@ -1658,13 +1741,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
struct poll_iocb *req = &aiocb->poll;
- spin_lock(&req->head->lock);
- WRITE_ONCE(req->cancelled, true);
- if (!list_empty(&req->wait.entry)) {
- list_del_init(&req->wait.entry);
- schedule_work(&aiocb->poll.work);
- }
- spin_unlock(&req->head->lock);
+ if (poll_iocb_lock_wq(req)) {
+ WRITE_ONCE(req->cancelled, true);
+ if (!req->work_scheduled) {
+ schedule_work(&aiocb->poll.work);
+ req->work_scheduled = true;
+ }
+ poll_iocb_unlock_wq(req);
+ } /* else, the request was force-cancelled by POLLFREE already */
return 0;
}
@@ -1681,21 +1765,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & req->events))
return 0;
- list_del_init(&req->wait.entry);
-
- if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+ /*
+ * Complete the request inline if possible. This requires that three
+ * conditions be met:
+ * 1. An event mask must have been passed. If a plain wakeup was done
+ * instead, then mask == 0 and we have to call vfs_poll() to get
+ * the events, so inline completion isn't possible.
+ * 2. The completion work must not have already been scheduled.
+ * 3. ctx_lock must not be busy. We have to use trylock because we
+ * already hold the waitqueue lock, so this inverts the normal
+ * locking order. Use irqsave/irqrestore because not all
+ * filesystems (e.g. fuse) call this function with IRQs disabled,
+ * yet IRQs have to be disabled before ctx_lock is obtained.
+ */
+ if (mask && !req->work_scheduled &&
+ spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
struct kioctx *ctx = iocb->ki_ctx;
- /*
- * Try to complete the iocb inline if we can. Use
- * irqsave/irqrestore because not all filesystems (e.g. fuse)
- * call this function with IRQs disabled and because IRQs
- * have to be disabled before ctx_lock is obtained.
- */
+ list_del_init(&req->wait.entry);
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
- if (iocb->ki_eventfd && eventfd_signal_allowed()) {
+ if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
@@ -1704,7 +1794,43 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (iocb)
iocb_put(iocb);
} else {
- schedule_work(&req->work);
+ /*
+ * Schedule the completion work if needed. If it was already
+ * scheduled, record that another wakeup came in.
+ *
+ * Don't remove the request from the waitqueue here, as it might
+ * not actually be complete yet (we won't know until vfs_poll()
+ * is called), and we must not miss any wakeups. POLLFREE is an
+ * exception to this; see below.
+ */
+ if (req->work_scheduled) {
+ req->work_need_resched = true;
+ } else {
+ schedule_work(&req->work);
+ req->work_scheduled = true;
+ }
+
+ /*
+ * If the waitqueue is being freed early but we can't complete
+ * the request inline, we have to tear down the request as best
+ * we can. That means immediately removing the request from its
+ * waitqueue and preventing all further accesses to the
+ * waitqueue via the request. We also need to schedule the
+ * completion work (done above). Also mark the request as
+ * cancelled, to potentially skip an unneeded call to ->poll().
+ */
+ if (mask & POLLFREE) {
+ WRITE_ONCE(req->cancelled, true);
+ list_del_init(&req->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&req->head, NULL);
+ }
}
return 1;
}
@@ -1712,6 +1838,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct aio_poll_table {
struct poll_table_struct pt;
struct aio_kiocb *iocb;
+ bool queued;
int error;
};
@@ -1722,11 +1849,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
/* multiple wait queues per file are not supported */
- if (unlikely(pt->iocb->poll.head)) {
+ if (unlikely(pt->queued)) {
pt->error = -EINVAL;
return;
}
+ pt->queued = true;
pt->error = 0;
pt->iocb->poll.head = head;
add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1751,12 +1879,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
req->head = NULL;
- req->done = false;
req->cancelled = false;
+ req->work_scheduled = false;
+ req->work_need_resched = false;
apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
apt.iocb = aiocb;
+ apt.queued = false;
apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
@@ -1765,23 +1895,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
mask = vfs_poll(req->file, &apt.pt) & req->events;
spin_lock_irq(&ctx->ctx_lock);
- if (likely(req->head)) {
- spin_lock(&req->head->lock);
- if (unlikely(list_empty(&req->wait.entry))) {
- if (apt.error)
+ if (likely(apt.queued)) {
+ bool on_queue = poll_iocb_lock_wq(req);
+
+ if (!on_queue || req->work_scheduled) {
+ /*
+ * aio_poll_wake() already either scheduled the async
+ * completion work, or completed the request inline.
+ */
+ if (apt.error) /* unsupported case: multiple queues */
cancel = true;
apt.error = 0;
mask = 0;
}
if (mask || apt.error) {
+ /* Steal to complete synchronously. */
list_del_init(&req->wait.entry);
} else if (cancel) {
+ /* Cancel if possible (may be too late though). */
WRITE_ONCE(req->cancelled, true);
- } else if (!req->done) { /* actually waiting for an event */
+ } else if (on_queue) {
+ /*
+ * Actually waiting for an event, so add the request to
+ * active_reqs so that it can be cancelled if needed.
+ */
list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
aiocb->ki_cancel = aio_poll_cancel;
}
- spin_unlock(&req->head->lock);
+ if (on_queue)
+ poll_iocb_unlock_wq(req);
}
if (mask) { /* no async, we'd stolen it */
aiocb->ki_res.res = mangle_poll(mask);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index a280156138ed..e0c3e33c4177 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -148,6 +148,35 @@ struct file *anon_inode_getfile(const char *name,
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);
+/**
+ * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new
+ * !S_PRIVATE anon inode rather than reuse the
+ * singleton anon inode and calls the
+ * inode_init_security_anon() LSM hook. This
+ * allows for both the inode to have its own
+ * security context and for the LSM to enforce
+ * policy on the inode's creation.
+ *
+ * @name: [in] name of the "class" of the new file
+ * @fops: [in] file operations for the new file
+ * @priv: [in] private data for the new file (will be file's private_data)
+ * @flags: [in] flags
+ * @context_inode:
+ * [in] the logical relationship with the new inode (optional)
+ *
+ * The LSM may use @context_inode in inode_init_security_anon(), but a
+ * reference to it is not held. Returns the newly created file* or an error
+ * pointer. See the anon_inode_getfile() documentation for more information.
+ */
+struct file *anon_inode_getfile_secure(const char *name,
+ const struct file_operations *fops,
+ void *priv, int flags,
+ const struct inode *context_inode)
+{
+ return __anon_inode_getfile(name, fops, priv, flags,
+ context_inode, true);
+}
+
static int __anon_inode_getfd(const char *name,
const struct file_operations *fops,
void *priv, int flags,
diff --git a/fs/attr.c b/fs/attr.c
index 473d21b3a86d..66899b6e9bd8 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -35,7 +35,7 @@ static bool chown_ok(struct user_namespace *mnt_userns,
kuid_t uid)
{
kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, kuid))
+ if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
@@ -62,7 +62,7 @@ static bool chgrp_ok(struct user_namespace *mnt_userns,
{
kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) &&
- (in_group_p(gid) || gid_eq(gid, kgid)))
+ (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c1ba13d19024..b4b3567ac655 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb)
{
struct befs_inode_info *bi;
- bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
+ bi = alloc_inode_sb(sb, befs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 7f8544abf636..03139344568f 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -188,7 +188,8 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations bfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = bfs_readpage,
.writepage = bfs_writepage,
.write_begin = bfs_write_begin,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fd691e4815c5..1926bec2c850 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -239,7 +239,7 @@ static struct kmem_cache *bfs_inode_cachep;
static struct inode *bfs_alloc_inode(struct super_block *sb)
{
struct bfs_inode_info *bi;
- bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL);
+ bi = alloc_inode_sb(sb, bfs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a813b70f594e..63c7ebb0da89 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm);
#define ELF_CORE_EFLAGS 0
#endif
-#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1))
+#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1))
#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
@@ -101,8 +101,10 @@ static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
.load_shlib = load_elf_library,
+#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
@@ -156,7 +158,7 @@ static int padzero(unsigned long elf_bss)
#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items))
#define STACK_ROUND(sp, items) \
(((unsigned long) (sp - items)) &~ 15UL)
-#define STACK_ALLOC(sp, len) ({ sp -= len ; sp; })
+#define STACK_ALLOC(sp, len) (sp -= len)
#endif
#ifndef ELF_BASE_PLATFORM
@@ -170,8 +172,8 @@ static int padzero(unsigned long elf_bss)
static int
create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
- unsigned long load_addr, unsigned long interp_load_addr,
- unsigned long e_entry)
+ unsigned long interp_load_addr,
+ unsigned long e_entry, unsigned long phdr_addr)
{
struct mm_struct *mm = current->mm;
unsigned long p = bprm->p;
@@ -257,7 +259,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
- NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
+ NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
NEW_AUX_ENT(AT_BASE, interp_load_addr);
@@ -399,22 +401,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
-static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
- int i, first_idx = -1, last_idx = -1;
+ elf_addr_t min_addr = -1;
+ elf_addr_t max_addr = 0;
+ bool pt_load = false;
+ int i;
for (i = 0; i < nr; i++) {
- if (cmds[i].p_type == PT_LOAD) {
- last_idx = i;
- if (first_idx == -1)
- first_idx = i;
+ if (phdr[i].p_type == PT_LOAD) {
+ min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr));
+ max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz);
+ pt_load = true;
}
}
- if (first_idx == -1)
- return 0;
-
- return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
- ELF_PAGESTART(cmds[first_idx].p_vaddr);
+ return pt_load ? (max_addr - min_addr) : 0;
}
static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
@@ -823,8 +824,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
- unsigned long load_addr = 0, load_bias = 0;
- int load_addr_set = 0;
+ unsigned long load_bias = 0, phdr_addr = 0;
+ int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
@@ -1074,20 +1075,26 @@ out_free_interp:
vaddr = elf_ppnt->p_vaddr;
/*
- * If we are loading ET_EXEC or we have already performed
- * the ET_DYN load_addr calculations, proceed normally.
+ * The first time through the loop, first_pt_load is true:
+ * layout will be calculated. Once set, use MAP_FIXED since
+ * we know we've already safely mapped the entire region with
+ * MAP_FIXED_NOREPLACE in the once-per-binary logic following.
*/
- if (elf_ex->e_type == ET_EXEC || load_addr_set) {
+ if (!first_pt_load) {
elf_flags |= MAP_FIXED;
+ } else if (elf_ex->e_type == ET_EXEC) {
+ /*
+ * This logic is run once for the first LOAD Program
+ * Header for ET_EXEC binaries. No special handling
+ * is needed.
+ */
+ elf_flags |= MAP_FIXED_NOREPLACE;
} else if (elf_ex->e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
* Header for ET_DYN binaries to calculate the
* randomization (load_bias) for all the LOAD
- * Program Headers, and to calculate the entire
- * size of the ELF mapping (total_size). (Note that
- * load_addr_set is set to true later once the
- * initial mapping is performed.)
+ * Program Headers.
*
* There are effectively two types of ET_DYN
* binaries: programs (i.e. PIE: ET_DYN with INTERP)
@@ -1108,7 +1115,7 @@ out_free_interp:
* Therefore, programs are loaded offset from
* ELF_ET_DYN_BASE and loaders are loaded into the
* independently randomized mmap region (0 load_bias
- * without MAP_FIXED).
+ * without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
if (interpreter) {
load_bias = ELF_ET_DYN_BASE;
@@ -1117,7 +1124,7 @@ out_free_interp:
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
if (alignment)
load_bias &= ~(alignment - 1);
- elf_flags |= MAP_FIXED;
+ elf_flags |= MAP_FIXED_NOREPLACE;
} else
load_bias = 0;
@@ -1130,6 +1137,24 @@ out_free_interp:
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
+ /*
+ * Calculate the entire size of the ELF mapping
+ * (total_size), used for the initial mapping,
+ * due to first_pt_load which is set to false later
+ * once the initial mapping is performed.
+ *
+ * Note that this is only sensible when the LOAD
+ * segments are contiguous (or overlapping). If
+ * used for LOADs that are far apart, this would
+ * cause the holes between LOADs to be mapped,
+ * running the risk of having the mapping fail,
+ * as it would be larger than the ELF file itself.
+ *
+ * As a result, only ET_DYN does this, since
+ * some ET_EXEC (e.g. ia64) may have large virtual
+ * memory holes between LOADs.
+ *
+ */
total_size = total_mapping_size(elf_phdata,
elf_ex->e_phnum);
if (!total_size) {
@@ -1146,16 +1171,25 @@ out_free_interp:
goto out_free_dentry;
}
- if (!load_addr_set) {
- load_addr_set = 1;
- load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
+ if (first_pt_load) {
+ first_pt_load = 0;
if (elf_ex->e_type == ET_DYN) {
load_bias += error -
ELF_PAGESTART(load_bias + vaddr);
- load_addr += load_bias;
reloc_func_desc = load_bias;
}
}
+
+ /*
+ * Figure out which segment in the file contains the Program
+ * Header table, and map to the associated memory address.
+ */
+ if (elf_ppnt->p_offset <= elf_ex->e_phoff &&
+ elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) {
+ phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset +
+ elf_ppnt->p_vaddr;
+ }
+
k = elf_ppnt->p_vaddr;
if ((elf_ppnt->p_flags & PF_X) && k < start_code)
start_code = k;
@@ -1191,6 +1225,7 @@ out_free_interp:
}
e_entry = elf_ex->e_entry + load_bias;
+ phdr_addr += load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
@@ -1254,8 +1289,8 @@ out_free_interp:
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
- retval = create_elf_tables(bprm, elf_ex,
- load_addr, interp_load_addr, e_entry);
+ retval = create_elf_tables(bprm, elf_ex, interp_load_addr,
+ e_entry, phdr_addr);
if (retval < 0)
goto out;
@@ -1572,7 +1607,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
- strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+ get_task_comm(psinfo->pr_fname, p);
return 0;
}
@@ -1606,17 +1641,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
* long file_ofs
* followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
*/
-static int fill_files_note(struct memelfnote *note)
+static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
unsigned count, size, names_ofs, remaining, n;
user_long_t *data;
user_long_t *start_end_ofs;
char *name_base, *name_curpos;
+ int i;
/* *Estimated* file count and total data size needed */
- count = mm->map_count;
+ count = cprm->vma_count;
if (count > UINT_MAX / 64)
return -EINVAL;
size = count * 64;
@@ -1638,11 +1672,12 @@ static int fill_files_note(struct memelfnote *note)
name_base = name_curpos = ((char *)data) + names_ofs;
remaining = size - names_ofs;
count = 0;
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = &cprm->vma_meta[i];
struct file *file;
const char *filename;
- file = vma->vm_file;
+ file = m->file;
if (!file)
continue;
filename = file_path(file, name_curpos, remaining);
@@ -1662,9 +1697,9 @@ static int fill_files_note(struct memelfnote *note)
memmove(name_curpos, filename, n);
name_curpos += n;
- *start_end_ofs++ = vma->vm_start;
- *start_end_ofs++ = vma->vm_end;
- *start_end_ofs++ = vma->vm_pgoff;
+ *start_end_ofs++ = m->start;
+ *start_end_ofs++ = m->end;
+ *start_end_ofs++ = m->pgoff;
count++;
}
@@ -1675,7 +1710,7 @@ static int fill_files_note(struct memelfnote *note)
* Count usually is less than mm->map_count,
* we need to move filenames down.
*/
- n = mm->map_count - count;
+ n = cprm->vma_count - count;
if (n != 0) {
unsigned shift_bytes = n * 3 * sizeof(data[0]);
memmove(name_base - shift_bytes, name_base,
@@ -1731,9 +1766,9 @@ static void do_thread_regset_writeback(struct task_struct *task,
static int fill_thread_core_info(struct elf_thread_core_info *t,
const struct user_regset_view *view,
- long signr, size_t *total)
+ long signr, struct elf_note_info *info)
{
- unsigned int i;
+ unsigned int note_iter, view_iter;
/*
* NT_PRSTATUS is the one special case, because the regset data
@@ -1747,17 +1782,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
PRSTATUS_SIZE, &t->prstatus);
- *total += notesize(&t->notes[0]);
+ info->size += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
/*
* Each other regset might generate a note too. For each regset
- * that has no core_note_type or is inactive, we leave t->notes[i]
- * all zero and we'll know to skip writing it later.
+ * that has no core_note_type or is inactive, skip it.
*/
- for (i = 1; i < view->n; ++i) {
- const struct user_regset *regset = &view->regsets[i];
+ note_iter = 1;
+ for (view_iter = 1; view_iter < view->n; ++view_iter) {
+ const struct user_regset *regset = &view->regsets[view_iter];
int note_type = regset->core_note_type;
bool is_fpreg = note_type == NT_PRFPREG;
void *data;
@@ -1773,13 +1808,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (ret < 0)
continue;
+ if (WARN_ON_ONCE(note_iter >= info->thread_notes))
+ break;
+
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX",
+ fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
note_type, ret, data);
- *total += notesize(&t->notes[i]);
+ info->size += notesize(&t->notes[note_iter]);
+ note_iter++;
}
return 1;
@@ -1787,7 +1826,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1834,7 +1873,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
/*
* Allocate a structure for each thread.
*/
- for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+ for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
t = kzalloc(offsetof(struct elf_thread_core_info,
notes[info->thread_notes]),
GFP_KERNEL);
@@ -1859,7 +1898,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Now fill in each thread's information.
*/
for (t = info->thread; t != NULL; t = t->next)
- if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
+ if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, info))
return 0;
/*
@@ -1868,13 +1907,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
info->size += notesize(&info->psinfo);
- fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo);
info->size += notesize(&info->signote);
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
- if (fill_files_note(&info->files) == 0)
+ if (fill_files_note(&info->files, cprm) == 0)
info->size += notesize(&info->files);
return 1;
@@ -2016,7 +2055,7 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2024,7 +2063,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
if (!elf_note_info_init(info))
return 0;
- for (ct = current->mm->core_state->dumper.next;
+ for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
ets = kzalloc(sizeof(*ets), GFP_KERNEL);
if (!ets)
@@ -2037,13 +2076,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- sz = elf_dump_thread_status(siginfo->si_signo, ets);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets);
info->thread_status_size += sz;
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(&info->prstatus->common, current, siginfo->si_signo);
- elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+ fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo);
+ elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs);
/* Set up header */
fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
@@ -2059,18 +2098,18 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
sizeof(*info->psinfo), info->psinfo);
- fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo);
fill_auxv_note(info->notes + 3, current->mm);
info->numnote = 4;
- if (fill_files_note(info->notes + info->numnote) == 0) {
+ if (fill_files_note(info->notes + info->numnote, cprm) == 0) {
info->notes_files = info->notes + info->numnote;
info->numnote++;
}
/* Try to dump the FPU. */
- info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
- info->fpu);
+ info->prstatus->pr_fpvalid =
+ elf_core_copy_task_fpregs(current, cprm->regs, info->fpu);
if (info->prstatus->pr_fpvalid)
fill_note(info->notes + info->numnote++,
"CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
@@ -2156,8 +2195,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs, i;
- size_t vma_data_size;
+ int segs, i;
struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
@@ -2165,16 +2203,12 @@ static int elf_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
- struct core_vma_metadata *vma_meta;
-
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- return 0;
/*
* The number of segs are recored into ELF header as 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -2188,7 +2222,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs))
+ if (!fill_note_info(&elf, e_phnum, &info, cprm))
goto end_coredump;
has_dumped = 1;
@@ -2213,7 +2247,7 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2233,8 +2267,8 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
phdr.p_type = PT_LOAD;
@@ -2271,8 +2305,8 @@ static int elf_core_dump(struct coredump_params *cprm)
/* Align to page */
dump_skip_to(cprm, dataoff);
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
if (!dump_user_range(cprm, meta->start, meta->dump_size))
goto end_coredump;
@@ -2289,7 +2323,6 @@ static int elf_core_dump(struct coredump_params *cprm)
end_coredump:
free_note_info(&info);
kfree(shdr4extnum);
- kvfree(vma_meta);
kfree(phdr4note);
return has_dumped;
}
@@ -2311,3 +2344,7 @@ static void __exit exit_elf_binfmt(void)
core_initcall(init_elf_binfmt);
module_exit(exit_elf_binfmt);
MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_BINFMT_ELF_KUNIT_TEST
+#include "binfmt_elf_test.c"
+#endif
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6d8fd6030cbb..08d0c8797828 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -83,8 +83,8 @@ static struct linux_binfmt elf_fdpic_format = {
.load_binary = load_elf_fdpic_binary,
#ifdef CONFIG_ELF_CORE
.core_dump = elf_fdpic_core_dump,
-#endif
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
static int __init init_elf_fdpic_binfmt(void)
@@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm,
static int elf_fdpic_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs;
+ int segs;
int i;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
@@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
elf_addr_t e_shoff;
struct core_thread *ct;
struct elf_thread_status *tmp;
- struct core_vma_metadata *vma_meta = NULL;
- size_t vma_data_size;
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1491,10 +1489,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!psinfo)
goto end_coredump;
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- goto end_coredump;
-
- for (ct = current->mm->core_state->dumper.next;
+ for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
ct->task, &thread_status_size);
@@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
size_t sz;
@@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
dump_skip_to(cprm, dataoff);
- if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
+ if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count))
goto end_coredump;
if (!elf_core_write_extra_data(cprm))
@@ -1652,7 +1647,6 @@ end_coredump:
thread_list = thread_list->next;
kfree(tmp);
}
- kvfree(vma_meta);
kfree(phdr4note);
kfree(elf);
kfree(psinfo);
diff --git a/fs/binfmt_elf_test.c b/fs/binfmt_elf_test.c
new file mode 100644
index 000000000000..11d734fec366
--- /dev/null
+++ b/fs/binfmt_elf_test.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+
+static void total_mapping_size_test(struct kunit *test)
+{
+ struct elf_phdr empty[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0, .p_memsz = 0, },
+ { .p_type = PT_INTERP, .p_vaddr = 10, .p_memsz = 999999, },
+ };
+ /*
+ * readelf -lW /bin/mount | grep '^ .*0x0' | awk '{print "\t\t{ .p_type = PT_" \
+ * $1 ", .p_vaddr = " $3 ", .p_memsz = " $6 ", },"}'
+ */
+ struct elf_phdr mount[] = {
+ { .p_type = PT_PHDR, .p_vaddr = 0x00000040, .p_memsz = 0x0002d8, },
+ { .p_type = PT_INTERP, .p_vaddr = 0x00000318, .p_memsz = 0x00001c, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_DYNAMIC, .p_vaddr = 0x0000d928, .p_memsz = 0x000200, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000368, .p_memsz = 0x000044, },
+ { .p_type = PT_GNU_PROPERTY, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_GNU_EH_FRAME, .p_vaddr = 0x0000b490, .p_memsz = 0x0001ec, },
+ { .p_type = PT_GNU_STACK, .p_vaddr = 0x00000000, .p_memsz = 0x000000, },
+ { .p_type = PT_GNU_RELRO, .p_vaddr = 0x0000d330, .p_memsz = 0x000cd0, },
+ };
+ size_t mount_size = 0xE070;
+ /* https://lore.kernel.org/linux-fsdevel/YfF18Dy85mCntXrx@fractal.localdomain */
+ struct elf_phdr unordered[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ };
+
+ /* No headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(NULL, 0), 0);
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 0), 0);
+ /* Empty headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 1), 0);
+ /* No PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(&empty[1], 1), 0);
+ /* Empty PT_LOAD and non-PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 2), 0);
+
+ /* Normal set of PT_LOADS, and expected size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(mount, ARRAY_SIZE(mount)), mount_size);
+ /* Unordered PT_LOADs result in same size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(unordered, ARRAY_SIZE(unordered)), mount_size);
+}
+
+static struct kunit_case binfmt_elf_test_cases[] = {
+ KUNIT_CASE(total_mapping_size_test),
+ {},
+};
+
+static struct kunit_suite binfmt_elf_test_suite = {
+ .name = KBUILD_MODNAME,
+ .test_cases = binfmt_elf_test_cases,
+};
+
+kunit_test_suite(binfmt_elf_test_suite);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5d776f80ee50..626898150011 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -37,6 +37,7 @@
#include <linux/flat.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/coredump.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
@@ -97,13 +98,17 @@ static int load_flat_shared_library(int id, struct lib_info *p);
#endif
static int load_flat_binary(struct linux_binprm *);
+#ifdef CONFIG_COREDUMP
static int flat_core_dump(struct coredump_params *cprm);
+#endif
static struct linux_binfmt flat_format = {
.module = THIS_MODULE,
.load_binary = load_flat_binary,
+#ifdef CONFIG_COREDUMP
.core_dump = flat_core_dump,
.min_coredump = PAGE_SIZE
+#endif
};
/****************************************************************************/
@@ -112,12 +117,14 @@ static struct linux_binfmt flat_format = {
* Currently only a stub-function.
*/
+#ifdef CONFIG_COREDUMP
static int flat_core_dump(struct coredump_params *cprm)
{
pr_warn("Process %s:%d received signr %d and should have core dumped\n",
current->comm, current->pid, cprm->siginfo->si_signo);
return 1;
}
+#endif
/****************************************************************************/
/*
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 520a0f6a7d9e..183e5c4aed34 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -18,8 +18,7 @@ config BTRFS_FS
select RAID6_PQ
select XOR_BLOCKS
select SRCU
- depends on !PPC_256K_PAGES # powerpc
- depends on !PAGE_SIZE_256KB # hexagon
+ depends on PAGE_SIZE_LESS_THAN_256KB
help
Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3dcf9bcc2326..99f9995670ea 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -17,6 +17,7 @@ subdir-ccflags-y += $(condflags)
subdir-ccflags-y += -Wno-missing-field-initializers
subdir-ccflags-y += -Wno-sign-compare
subdir-ccflags-y += -Wno-type-limits
+subdir-ccflags-y += -Wno-shift-negative-value
obj-$(CONFIG_BTRFS_FS) := btrfs.o
@@ -27,7 +28,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 309516e6a968..43c89952b7d2 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -234,6 +234,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq,
ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
+ /*
+ * Orders all subsequent loads after reading WORK_DONE_BIT,
+ * paired with the smp_mb__before_atomic in btrfs_work_helper
+ * this guarantees that the ordered function will see all
+ * updates from ordinary work function.
+ */
+ smp_rmb();
/*
* we are going to call the ordered done function, but
@@ -317,6 +324,13 @@ static void btrfs_work_helper(struct work_struct *normal_work)
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
+ /*
+ * Ensures all memory accesses done in the work function are
+ * ordered before setting the WORK_DONE_BIT. Ensuring the thread
+ * which is going to execute the ordered work sees them.
+ * Pairs with the smp_rmb in run_ordered_work.
+ */
+ smp_mb__before_atomic();
set_bit(WORK_DONE_BIT, &work->flags);
run_ordered_work(wq, work);
} else {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f735b8798ba1..ebc392ea1d74 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_pref(ref);
free_extent_buffer(eb);
return -EIO;
}
+
if (lock)
btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
@@ -950,7 +952,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
@@ -1049,12 +1051,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
*
* Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
*/
-static int add_keyed_refs(struct btrfs_fs_info *fs_info,
+static int add_keyed_refs(struct btrfs_root *extent_root,
struct btrfs_path *path, u64 bytenr,
int info_level, struct preftrees *preftrees,
struct share_check *sc)
{
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
int ret;
int slot;
struct extent_buffer *leaf;
@@ -1170,6 +1172,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct ulist *roots, const u64 *extent_item_pos,
struct share_check *sc, bool ignore_offset)
{
+ struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_delayed_ref_root *delayed_refs = NULL;
@@ -1203,28 +1206,26 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
if (time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
- /*
- * grab both a lock on the path and a lock on the delayed ref head.
- * We need both to get a consistent picture of how the refs look
- * at a specified point in time
- */
again:
head = NULL;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /* This shouldn't happen, indicates a bug or fs corruption. */
+ ASSERT(ret != 0);
+ ret = -EUCLEAN;
+ goto out;
+ }
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
time_seq != BTRFS_SEQ_LAST) {
-#else
- if (trans && time_seq != BTRFS_SEQ_LAST) {
-#endif
/*
- * look if there are updates for this ref queued and lock the
- * head
+ * We have a specific time_seq we care about and trans which
+ * means we have the path lock, we need to grab the ref head and
+ * lock it so we have a consistent view of the refs at the given
+ * time.
*/
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
@@ -1271,7 +1272,7 @@ again:
&info_level, &preftrees, sc);
if (ret)
goto out;
- ret = add_keyed_refs(fs_info, path, bytenr, info_level,
+ ret = add_keyed_refs(root, path, bytenr, info_level,
&preftrees, sc);
if (ret)
goto out;
@@ -1336,7 +1337,8 @@ again:
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
@@ -1360,10 +1362,18 @@ again:
goto out;
if (!ret && extent_item_pos) {
/*
- * we've recorded that parent, so we must extend
- * its inode list here
+ * We've recorded that parent, so we must extend
+ * its inode list here.
+ *
+ * However if there was corruption we may not
+ * have found an eie, return an error in this
+ * case.
*/
- BUG_ON(!eie);
+ ASSERT(eie);
+ if (!eie) {
+ ret = -EUCLEAN;
+ goto out;
+ }
while (eie->next)
eie = eie->next;
eie->next = ref->inode_list;
@@ -1740,6 +1750,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
u64 *flags_ret)
{
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
int ret;
u64 flags;
u64 size = 0;
@@ -1755,11 +1766,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
key.objectid = logical;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -1779,7 +1790,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
@@ -1962,7 +1973,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
extent_item_objectid);
if (!search_commit_root) {
- trans = btrfs_attach_transaction(fs_info->extent_root);
+ trans = btrfs_attach_transaction(fs_info->tree_root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT &&
PTR_ERR(trans) != -EROFS)
@@ -2058,7 +2069,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
u64 parent = 0;
int found = 0;
struct extent_buffer *eb;
- struct btrfs_item *item;
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
@@ -2084,10 +2094,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
}
btrfs_release_path(path);
- item = btrfs_item_nr(slot);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
- for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+ for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) {
name_len = btrfs_inode_ref_name_len(eb, iref);
/* path must be released before calling iterate()! */
btrfs_debug(fs_root->fs_info,
@@ -2143,7 +2152,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
}
btrfs_release_path(path);
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr_offset(eb, slot);
cur_offset = 0;
@@ -2330,6 +2339,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
{
struct btrfs_fs_info *fs_info = iter->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_path *path = iter->path;
struct btrfs_extent_item *ei;
struct btrfs_key key;
@@ -2340,7 +2350,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
key.offset = (u64)-1;
iter->bytenr = bytenr;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
if (ret == 0) {
@@ -2364,7 +2374,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
iter->end_ptr = (u32)(iter->item_ptr +
- btrfs_item_size_nr(path->nodes[0], path->slots[0]));
+ btrfs_item_size(path->nodes[0], path->slots[0]));
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
@@ -2383,7 +2393,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
/* If there is no inline backref, go search for keyed backref */
if (iter->cur_ptr >= iter->end_ptr) {
- ret = btrfs_next_item(fs_info->extent_root, path);
+ ret = btrfs_next_item(extent_root, path);
/* No inline nor keyed ref */
if (ret > 0) {
@@ -2404,7 +2414,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
iter->item_ptr = iter->cur_ptr;
- iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr(
+ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(
path->nodes[0], path->slots[0]));
}
@@ -2427,6 +2437,7 @@ release:
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
{
struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+ struct btrfs_root *extent_root;
struct btrfs_path *path = iter->path;
struct btrfs_extent_inline_ref *iref;
int ret;
@@ -2457,7 +2468,8 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
}
/* We're at keyed items, there is no inline item, go to the next one */
- ret = btrfs_next_item(iter->fs_info->extent_root, iter->path);
+ extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
+ ret = btrfs_next_item(extent_root, iter->path);
if (ret)
return ret;
@@ -2469,7 +2481,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
iter->cur_ptr = iter->item_ptr;
- iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0],
+ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0],
path->slots[0]);
return 0;
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a3b830b8410a..0dd6de994199 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
@@ -123,7 +124,16 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
{
if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0);
- WARN_ON(cache->reserved > 0);
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on reserved > 0 in that
+ * case.
+ */
+ if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
+ WARN_ON(cache->reserved > 0);
/*
* A block_group shouldn't be on the discard_list anymore.
@@ -144,6 +154,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
*/
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
+ kfree(cache->physical_map);
kfree(cache);
}
}
@@ -512,7 +523,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -527,6 +538,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
return -ENOMEM;
last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
+ extent_root = btrfs_extent_root(fs_info, last);
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -839,7 +851,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- root = fs_info->extent_root;
+ root = btrfs_block_group_root(fs_info);
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
@@ -902,6 +914,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cluster->refill_lock);
btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
path = btrfs_alloc_path();
if (!path) {
@@ -1103,6 +1116,7 @@ out:
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
@@ -1136,8 +1150,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
num_items = 3 + map->num_stripes;
free_extent_map(em);
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
- num_items);
+ return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
/*
@@ -1484,19 +1497,37 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
+/*
+ * Comparison callback handed to list_sort() for the fs_info->reclaim_bgs list.
+ *
+ * We want block groups with a low number of used bytes to be in the beginning
+ * of the list, so they will get reclaimed first.
+ *
+ * @unused: private pointer supplied by list_sort(); not used here.
+ * @a, @b:  bg_list nodes embedded in struct btrfs_block_group.
+ *
+ * Returns 1 when the block group of @a has more used bytes than the one of
+ * @b, and 0 otherwise (including when both are equal).
+ */
+static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
+ const struct list_head *b)
+{
+ const struct btrfs_block_group *bg1, *bg2;
+
+ bg1 = list_entry(a, struct btrfs_block_group, bg_list);
+ bg2 = list_entry(b, struct btrfs_block_group, bg_list);
+
+ return bg1->used > bg2->used;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
- LIST_HEAD(again_list);
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ sb_start_write(fs_info->sb);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ sb_end_write(fs_info->sb);
return;
+ }
/*
* Long running balances can keep us blocked here for eternity, so
@@ -1504,10 +1535,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
return;
}
spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * Sort happens under lock because we can't simply splice it and sort.
+ * The block groups might still be in use and reachable via bg_list,
+ * and their presence in the reclaim_bgs list must be preserved.
+ */
+ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
int ret = 0;
@@ -1561,21 +1599,18 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
- if (ret && ret != -EAGAIN)
+ if (ret)
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
next:
+ btrfs_put_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
- if (ret == -EAGAIN && list_empty(&bg->bg_list))
- list_add_tail(&bg->bg_list, &again_list);
- else
- btrfs_put_block_group(bg);
}
- list_splice_tail(&again_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
@@ -1654,7 +1689,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
struct btrfs_key found_key;
struct extent_buffer *leaf;
@@ -1895,6 +1930,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
+ INIT_LIST_HEAD(&cache->active_bg_list);
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
@@ -1976,6 +2012,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->length = key->offset;
cache->used = btrfs_stack_block_group_used(bgi);
cache->flags = btrfs_stack_block_group_flags(bgi);
+ cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
set_free_space_tree_thresholds(cache);
@@ -2035,6 +2072,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
*/
if (btrfs_is_zoned(info)) {
btrfs_calc_zone_unusable(cache);
+ /* Should not have any excluded extents. Just in case, though. */
+ btrfs_free_excluded_extents(cache);
} else if (cache->length == cache->used) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
@@ -2062,15 +2101,18 @@ static int read_one_block_group(struct btrfs_fs_info *info,
link_block_group(cache);
set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->start)) {
+ if (btrfs_chunk_writeable(info, cache->start)) {
+ if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ }
+ } else {
inc_block_group_ro(cache, 1);
- } else if (cache->used == 0) {
- ASSERT(list_empty(&cache->bg_list));
- if (btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_discard_queue_work(&info->discard_ctl, cache);
- else
- btrfs_mark_bg_unused(cache);
}
+
return 0;
error:
btrfs_put_block_group(cache);
@@ -2135,6 +2177,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
+ struct btrfs_root *root = btrfs_block_group_root(info);
struct btrfs_path *path;
int ret;
struct btrfs_block_group *cache;
@@ -2143,7 +2186,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
- if (!info->extent_root)
+ if (!root)
return fill_dummy_bgs(info);
key.objectid = 0;
@@ -2246,20 +2289,19 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_item bgi;
- struct btrfs_root *root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct btrfs_key key;
spin_lock(&block_group->lock);
btrfs_set_stack_block_group_used(&bgi, block_group->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ block_group->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
spin_unlock(&block_group->lock);
- root = fs_info->extent_root;
return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}
@@ -2409,6 +2451,27 @@ next:
btrfs_trans_release_chunk_metadata(trans);
}
+/*
+ * For extent tree v2 we use the block_group_item->chunk_objectid to point at
+ * our global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
+ *
+ * @fs_info: filesystem the block group belongs to.
+ * @offset:  logical offset used to pick the root (btrfs_make_block_group()
+ *           passes the block group start).
+ *
+ * For v2 the result is (@offset / div) % fs_info->nr_global_roots, where div
+ * is 1GiB, or 128MiB when the super block's total_bytes is at most 10GiB.
+ *
+ * NOTE(review): presumably nr_global_roots is non-zero whenever
+ * EXTENT_TREE_V2 is set, otherwise the remainder below divides by zero -
+ * confirm against the code that initializes it.
+ */
+static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
+{
+ u64 div = SZ_1G;
+ u64 index;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+
+ /* On a filesystem of at most 10GiB, index in 128MiB units instead of 1GiB. */
+ if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
+ div = SZ_128M;
+
+ /* Unit number of @offset, then wrapped around the number of global roots. */
+ offset = div64_u64(offset, div);
+ div64_u64_rem(offset, fs_info->nr_global_roots, &index);
+ return index;
+}
+
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 bytes_used, u64 type,
u64 chunk_offset, u64 size)
@@ -2429,6 +2492,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
+
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
cache->needs_free_space = 1;
@@ -2479,7 +2544,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, 0, &cache->space_info);
+ cache->bytes_super, cache->zone_unusable,
+ &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
@@ -2506,12 +2572,26 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
u64 alloc_flags;
int ret;
bool dirty_bg_running;
+ /*
+ * This can only happen when we are doing read-only scrub on read-only
+ * mount.
+ * In that case we should not start a new transaction on read-only fs.
+ * Thus here we skip all chunk allocations.
+ */
+ if (sb_rdonly(fs_info->sb)) {
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ ret = inc_block_group_ro(cache, 0);
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ return ret;
+ }
+
do {
- trans = btrfs_join_transaction(fs_info->extent_root);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -2594,7 +2674,9 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
if (!--cache->ro) {
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
- cache->zone_unusable = cache->alloc_offset - cache->used;
+ cache->zone_unusable =
+ (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
sinfo->bytes_zone_unusable += cache->zone_unusable;
sinfo->bytes_readonly -= cache->zone_unusable;
}
@@ -2614,7 +2696,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
unsigned long bi;
struct extent_buffer *leaf;
struct btrfs_block_group_item bgi;
@@ -2635,7 +2717,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
btrfs_set_stack_block_group_used(&bgi, cache->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
btrfs_mark_buffer_dirty(leaf);
@@ -2858,7 +2940,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2924,7 +3005,6 @@ again:
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -3025,7 +3105,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3083,7 +3162,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -3143,7 +3221,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
}
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc)
+ u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_block_group *cache = NULL;
@@ -3367,7 +3445,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
struct btrfs_block_group *bg;
int ret;
@@ -3380,36 +3458,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
*/
check_system_chunk(trans, flags);
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
goto out;
}
- /*
- * If this is a system chunk allocation then stop right here and do not
- * add the chunk item to the chunk btree. This is to prevent a deadlock
- * because this system chunk allocation can be triggered while COWing
- * some extent buffer of the chunk btree and while holding a lock on a
- * parent extent buffer, in which case attempting to insert the chunk
- * item (or update the device item) would result in a deadlock on that
- * parent extent buffer. In this case defer the chunk btree updates to
- * the second phase of chunk allocation and keep our reservation until
- * the second phase completes.
- *
- * This is a rare case and can only be triggered by the very few cases
- * we have where we need to touch the chunk btree outside chunk allocation
- * and chunk removal. These cases are basically adding a device, removing
- * a device or resizing a device.
- */
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- return 0;
-
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
/*
* Normally we are not expected to fail with -ENOSPC here, since we have
* previously reserved space in the system space_info and allocated one
- * new system chunk if necessary. However there are two exceptions:
+ * new system chunk if necessary. However there are three exceptions:
*
* 1) We may have enough free space in the system space_info but all the
* existing system block groups have a profile which can not be used
@@ -3435,13 +3494,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
* with enough free space got turned into RO mode by a running scrub,
* and in this case we have to allocate a new one and retry. We only
* need do this allocate and retry once, since we have a transaction
- * handle and scrub uses the commit root to search for block groups.
+ * handle and scrub uses the commit root to search for block groups;
+ *
+ * 3) We had one system block group with enough free space when we called
+ * check_system_chunk(), but after that, right before we tried to
+ * allocate the last extent buffer we needed, a discard operation came
+ * in and it temporarily removed the last free space entry from the
+ * block group (discard removes a free space entry, discards it, and
+ * then adds back the entry to the block group cache).
*/
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3466,7 +3532,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
out:
btrfs_trans_release_chunk_metadata(trans);
- return ret;
+ if (ret)
+ return ERR_PTR(ret);
+
+ btrfs_get_block_group(bg);
+ return bg;
}
/*
@@ -3519,7 +3589,15 @@ out:
* properly, either intentionally or as a bug. One example where this is
* done intentionally is fsync, as it does not reserve any transaction units
* and ends up allocating a variable number of metadata extents for log
- * tree extent buffers.
+ * tree extent buffers;
+ *
+ * 4) The task has reserved enough transaction units / metadata space, but right
+ * before it tries to allocate the last extent buffer it needs, a discard
+ * operation comes in and, temporarily, removes the last free space entry from
+ * the only metadata block group that had free space (discard starts by
+ * removing a free space entry from a block group, then does the discard
+ * operation and, once it's done, it adds back the free space entry to the
+ * block group).
*
* We also need this 2 phases setup when adding a device to a filesystem with
* a seed device - we must create new metadata and system chunks without adding
@@ -3537,14 +3615,14 @@ out:
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
* the system chunk array due to concurrent allocations") provides more details.
*
- * For allocation of system chunks, we defer the updates and insertions into the
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
- * if the chunk allocation is triggered while COWing an extent buffer of the
- * chunk btree, we are holding a lock on the parent of that extent buffer and
- * doing the chunk btree updates and insertions can require locking that parent.
- * This is for the very few and rare cases where we update the chunk btree that
- * are not chunk allocation or chunk removal: adding a device, removing a device
- * or resizing a device.
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the latter is used before doing
+ * a modification to the chunk btree - use cases for the latter are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
*
* The reservation of system space, done through check_system_chunk(), as well
* as all the updates and insertions into the chunk btree must be done while
@@ -3573,19 +3651,42 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *space_info;
+ struct btrfs_block_group *ret_bg;
bool wait_for_alloc = false;
bool should_alloc = false;
+ bool from_extent_allocation = false;
int ret = 0;
+ if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
+ from_extent_allocation = true;
+ force = CHUNK_ALLOC_FORCE;
+ }
+
/* Don't re-enter if we're already allocating a chunk */
if (trans->allocating_chunk)
return -ENOSPC;
/*
- * If we are removing a chunk, don't re-enter or we would deadlock.
- * System space reservation and system chunk allocation is done by the
- * chunk remove operation (btrfs_remove_chunk()).
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
*/
- if (trans->removing_chunk)
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
@@ -3650,9 +3751,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- ret = do_chunk_alloc(trans, flags);
+ ret_bg = do_chunk_alloc(trans, flags);
trans->allocating_chunk = false;
+ if (IS_ERR(ret_bg)) {
+ ret = PTR_ERR(ret_bg);
+ } else if (from_extent_allocation) {
+ /*
+ * New block group is likely to be used soon. Try to activate
+ * it now. Failure is OK for now.
+ */
+ btrfs_zone_activate(ret_bg);
+ }
+
+ if (!ret)
+ btrfs_put_block_group(ret_bg);
+
spin_lock(&space_info->lock);
if (ret < 0) {
if (ret == -ENOSPC)
@@ -3684,17 +3798,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
return num_dev;
}
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
- u64 thresh;
int ret = 0;
- u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
@@ -3707,19 +3818,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
- btrfs_calc_insert_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
+ left, bytes, type);
btrfs_dump_space_info(fs_info, info, 0, 0);
}
- if (left < thresh) {
+ if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
@@ -3728,35 +3833,83 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
- *
- * Also, if our caller is allocating a system chunk, do not
- * attempt to insert the chunk item in the chunk btree, as we
- * could deadlock on an extent buffer since our caller may be
- * COWing an extent buffer from the chunk btree.
*/
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
- } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ } else {
/*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
- * any error here.
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
*/
btrfs_chunk_alloc_add_chunk_item(trans, bg);
}
}
if (!ret) {
- ret = btrfs_block_rsv_add(fs_info->chunk_root,
+ ret = btrfs_block_rsv_add(fs_info,
&fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
+ bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
- trans->chunk_bytes_reserved += thresh;
+ trans->chunk_bytes_reserved += bytes;
}
}
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ *
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
+
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
@@ -3821,9 +3974,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
- spin_unlock(&info->unused_bgs_lock);
- spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->reclaim_bgs)) {
block_group = list_first_entry(&info->reclaim_bgs,
struct btrfs_block_group,
@@ -3833,6 +3984,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->unused_bgs_lock);
+ spin_lock(&info->zone_active_bgs_lock);
+ while (!list_empty(&info->zone_active_bgs)) {
+ block_group = list_first_entry(&info->zone_active_bgs,
+ struct btrfs_block_group,
+ active_bg_list);
+ list_del_init(&block_group->active_bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->zone_active_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
@@ -3879,9 +4040,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
* important and indicates a real bug if this happens.
*/
if (WARN_ON(space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0 ||
space_info->bytes_may_use > 0))
btrfs_dump_space_info(info, space_info, 0, 0);
+
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on bytes_reserved > 0 in
+ * that case.
+ */
+ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+ if (WARN_ON(space_info->bytes_reserved > 0))
+ btrfs_dump_space_info(info, space_info, 0, 0);
+ }
+
WARN_ON(space_info->reclaim_size > 0);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c72a71efcb18..e8308f2ad07d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -35,11 +35,15 @@ enum btrfs_discard_state {
* the FS with empty chunks
*
* CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_FORCE_FOR_EXTENT is like CHUNK_ALLOC_FORCE, but is called from
+ * find_free_extent() and also activates the zone
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
CHUNK_ALLOC_LIMITED,
CHUNK_ALLOC_FORCE,
+ CHUNK_ALLOC_FORCE_FOR_EXTENT,
};
struct btrfs_caching_control {
@@ -68,6 +72,7 @@ struct btrfs_block_group {
u64 bytes_super;
u64 flags;
u64 cache_generation;
+ u64 global_root_id;
/*
* If the free space extent count exceeds this number, convert the block
@@ -98,6 +103,7 @@ struct btrfs_block_group {
unsigned int to_copy:1;
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
+ unsigned int zone_is_active:1;
int disk_cache_state;
@@ -202,7 +208,10 @@ struct btrfs_block_group {
*/
u64 alloc_offset;
u64 zone_unusable;
+ u64 zone_capacity;
u64 meta_write_pointer;
+ struct map_lookup *physical_map;
+ struct list_head active_bg_list;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -280,7 +289,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc);
+ u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
@@ -289,6 +298,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 04a6226e0388..b3ee49b0b1e8 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -6,6 +6,7 @@
#include "space-info.h"
#include "transaction.h"
#include "block-group.h"
+#include "disk-io.h"
/*
* HOW DO BLOCK RESERVES WORK
@@ -208,7 +209,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
kfree(rsv);
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush)
{
@@ -217,7 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -241,7 +242,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush)
{
@@ -262,7 +263,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -351,23 +352,29 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
- u64 num_bytes;
- unsigned min_items;
+ struct btrfs_root *root, *tmp;
+ u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item);
+ unsigned int min_items = 1;
/*
* The global block rsv is based on the size of the extent tree, the
* checksum tree and the root tree. If the fs is empty we want to set
* it to a minimal amount for safety.
+ *
+ * At a minimum we are also going to need to modify the tree root and
+ * any global roots we could touch.
*/
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
- btrfs_root_used(&fs_info->csum_root->root_item) +
- btrfs_root_used(&fs_info->tree_root->root_item);
-
- /*
- * We at a minimum are going to modify the csum root, the tree root, and
- * the extent root.
- */
- min_items = 3;
+ read_lock(&fs_info->global_root_lock);
+ rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
+ rb_node) {
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ num_bytes += btrfs_root_used(&root->root_item);
+ min_items++;
+ }
+ }
+ read_unlock(&fs_info->global_root_lock);
/*
* But we also want to reserve enough space so we can do the fallback
@@ -412,6 +419,30 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
spin_unlock(&sinfo->lock);
}
+void btrfs_init_root_block_rsv(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ switch (root->root_key.objectid) {
+ case BTRFS_CSUM_TREE_OBJECTID:
+ case BTRFS_EXTENT_TREE_OBJECTID:
+ case BTRFS_FREE_SPACE_TREE_OBJECTID:
+ root->block_rsv = &fs_info->delayed_refs_rsv;
+ break;
+ case BTRFS_ROOT_TREE_OBJECTID:
+ case BTRFS_DEV_TREE_OBJECTID:
+ case BTRFS_QUOTA_TREE_OBJECTID:
+ root->block_rsv = &fs_info->global_block_rsv;
+ break;
+ case BTRFS_CHUNK_TREE_OBJECTID:
+ root->block_rsv = &fs_info->chunk_block_rsv;
+ break;
+ default:
+ root->block_rsv = NULL;
+ break;
+ }
+}
+
void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -426,22 +457,6 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
- /*
- * Our various recovery options can leave us with NULL roots, so check
- * here and just bail before we go dereferencing NULLs everywhere.
- */
- if (!fs_info->extent_root || !fs_info->csum_root ||
- !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
- return;
-
- fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
- if (fs_info->quota_root)
- fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
-
btrfs_update_global_block_rsv(fs_info);
}
@@ -467,8 +482,9 @@ static struct btrfs_block_rsv *get_block_rsv(
struct btrfs_block_rsv *block_rsv = NULL;
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
- (root == fs_info->csum_root && trans->adding_csums) ||
- (root == fs_info->uuid_root))
+ (root == fs_info->uuid_root) ||
+ (trans->adding_csums &&
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -523,7 +539,7 @@ again:
block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 0b6ae5302837..3b67ff08d434 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -50,6 +50,7 @@ struct btrfs_block_rsv {
};
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+void btrfs_init_root_block_rsv(struct btrfs_root *root);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
@@ -57,11 +58,11 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 76ee1452c57b..32131a5d321b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -14,6 +14,13 @@
#include "delayed-inode.h"
/*
+ * Since we search a directory based on f_pos (struct dir_context::pos) we have
+ * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
+ * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
+ */
+#define BTRFS_DIR_START_INDEX 2
+
+/*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
* the btrfs file release call will add this inode to the
@@ -138,17 +145,26 @@ struct btrfs_inode {
/* a local copy of root's last_log_commit */
int last_log_commit;
- /* total number of bytes pending delalloc, used by stat to calc the
- * real block usage of the file
+ /*
+ * Total number of bytes pending delalloc, used by stat to calculate the
+ * real block usage of the file. This is used only for files.
*/
u64 delalloc_bytes;
- /*
- * Total number of bytes pending delalloc that fall within a file
- * range that is either a hole or beyond EOF (and no prealloc extent
- * exists in the range). This is always <= delalloc_bytes.
- */
- u64 new_delalloc_bytes;
+ union {
+ /*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes and this
+ * is used only for files.
+ */
+ u64 new_delalloc_bytes;
+ /*
+ * The offset of the last dir index key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_index_offset;
+ };
/*
* total number of bytes pending defrag, used by stat to check whether
@@ -164,8 +180,9 @@ struct btrfs_inode {
u64 disk_i_size;
/*
- * if this is a directory then index_cnt is the counter for the index
- * number for new files that are created
+ * If this is a directory then index_cnt is the counter for the index
+ * number for new files that are created. For an empty directory, this
+ * must be initialized to BTRFS_DIR_START_INDEX.
*/
u64 index_cnt;
@@ -324,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
spin_unlock(&inode->lock);
}
+/*
+ * Should be called while holding the inode's VFS lock in exclusive mode or in a
+ * context where no one else can access the inode concurrently (during inode
+ * creation or when loading an inode from disk).
+ */
+static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
+{
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ /*
+ * The inode may have been part of a reflink operation in the last
+ * transaction that modified it, and then a fsync has reset the
+ * last_reflink_trans to avoid subsequent fsyncs in the same
+ * transaction to do unnecessary work. So update last_reflink_trans
+ * to the last_trans value (we have to be pessimistic and assume a
+ * reflink happened).
+ *
+ * The ->last_trans is protected by the inode's spinlock and we can
+ * have a concurrent ordered extent completion update it. Also set
+ * last_reflink_trans to ->last_trans only if the former is less than
+ * the latter, because we can be called in a context where
+ * last_reflink_trans was set to the current transaction generation
+ * while ->last_trans was not yet updated in the current transaction,
+ * and therefore has a lower value.
+ */
+ spin_lock(&inode->lock);
+ if (inode->last_reflink_trans < inode->last_trans)
+ inode->last_reflink_trans = inode->last_trans;
+ spin_unlock(&inode->lock);
+}
+
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
bool ret = false;
@@ -337,9 +384,25 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
return ret;
}
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
+
struct btrfs_dio_private {
struct inode *inode;
- u64 logical_offset;
+
+ /*
+ * Since DIO can use anonymous pages, we cannot use page_offset() to
+ * grab the file offset, so we need a dedicated member for the file offset.
+ */
+ u64 file_offset;
u64 disk_bytenr;
/* Used for bio::bi_size */
u32 bytes;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 86816088927f..abac86a75840 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -78,7 +78,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mutex.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
@@ -186,7 +185,6 @@ struct btrfsic_dev_state {
struct list_head collision_resolving_node; /* list node */
struct btrfsic_block dummy_block_for_bio_bh_flush;
u64 last_flush_gen;
- char name[BDEVNAME_SIZE];
};
struct btrfsic_block_hashtable {
@@ -403,7 +401,6 @@ static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
ds->bdev = NULL;
ds->state = NULL;
- ds->name[0] = '\0';
INIT_LIST_HEAD(&ds->collision_resolving_node);
ds->last_flush_gen = 0;
btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
@@ -756,10 +753,10 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
+ "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
- dev_state->name, dev_bytenr,
+ dev_state->bdev, dev_bytenr,
superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
&state->all_blocks_list);
@@ -938,9 +935,10 @@ continue_with_current_leaf_stack_frame:
if (disk_item_offset + sizeof(struct btrfs_item) >
sf->block_ctx->len) {
leaf_item_out_of_bounce_error:
- pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(sf->block_ctx,
@@ -1058,9 +1056,10 @@ continue_with_current_node_stack_frame:
(uintptr_t)nodehdr;
if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
sf->block_ctx->len) {
- pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(
@@ -1228,15 +1227,17 @@ static int btrfsic_create_link_to_next_block(
if (next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr))
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block),
next_block->logical_bytenr);
else
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+ "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block));
@@ -1324,8 +1325,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset +
offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
@@ -1344,8 +1345,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
btrfsic_read_from_block_data(block_ctx, &file_extent_item,
@@ -1421,9 +1422,10 @@ static int btrfsic_handle_extent_data(
next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr)) {
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n",
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
next_bytenr,
- next_block_ctx.dev->name,
+ next_block_ctx.dev->bdev,
next_block_ctx.dev_bytenr,
mirror_num,
next_block->logical_bytenr);
@@ -1455,7 +1457,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfs_fs_info *fs_info = state->fs_info;
int ret;
u64 length;
- struct btrfs_bio *multi = NULL;
+ struct btrfs_io_context *multi = NULL;
struct btrfs_device *device;
length = len;
@@ -1561,7 +1563,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
struct bio *bio;
unsigned int j;
- bio = btrfs_io_bio_alloc(num_pages - i);
+ bio = btrfs_bio_alloc(num_pages - i);
bio_set_dev(bio, block_ctx->dev->bdev);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
bio->bi_opf = REQ_OP_READ;
@@ -1577,8 +1579,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -1;
}
if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %s!\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: read error at logical %llu dev %pg!\n",
+ block_ctx->start, block_ctx->dev->bdev);
bio_put(bio);
return -1;
}
@@ -1602,33 +1604,35 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
const struct btrfsic_block_link *l;
- pr_info("%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("%c-block @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
}
@@ -1743,16 +1747,18 @@ again:
if (block->logical_bytenr != bytenr &&
!(!block->is_metadata &&
block->logical_bytenr == 0))
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- bytenr, dev_state->name,
+ pr_info(
+"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ bytenr, dev_state->bdev,
dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state,
block),
block->logical_bytenr);
else
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev,
dev_bytenr, block->mirror_num,
btrfsic_get_block_type(state,
block));
@@ -1767,8 +1773,9 @@ again:
processed_len = state->datablock_size;
bytenr = block->logical_bytenr;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name, dev_bytenr,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev, dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state, block));
}
@@ -1778,9 +1785,10 @@ again:
list_empty(&block->ref_to_list) ? ' ' : '!',
list_empty(&block->ref_from_list) ? ' ' : '!');
if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_disk_key_objectid(&block->disk_key),
block->disk_key.type,
@@ -1792,9 +1800,10 @@ again:
}
if (!block->is_iodone && !block->never_written) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_stack_header_generation(
(struct btrfs_header *)
@@ -1921,8 +1930,9 @@ again:
if (!is_metadata) {
processed_len = state->datablock_size;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block (%s/%llu/?) !found in hash table, D.\n",
- dev_state->name, dev_bytenr);
+ pr_info(
+ "written block (%pg/%llu/?) !found in hash table, D\n",
+ dev_state->bdev, dev_bytenr);
if (!state->include_extent_data) {
/* ignore that written D block */
goto continue_loop;
@@ -1939,8 +1949,9 @@ again:
btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
dev_bytenr);
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+ "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
+ bytenr, dev_state->bdev, dev_bytenr);
}
block_ctx.dev = dev_state;
@@ -1995,9 +2006,9 @@ again:
block->next_in_same_bio = NULL;
}
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New written %c-block @%llu (%s/%llu/%d)\n",
+ pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2041,10 +2052,10 @@ static void btrfsic_bio_end_io(struct bio *bp)
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
bp->bi_status,
btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, block->mirror_num);
next_block = block->next_in_same_bio;
block->iodone_w_error = iodone_w_error;
@@ -2052,8 +2063,8 @@ static void btrfsic_bio_end_io(struct bio *bp)
dev_state->last_flush_gen++;
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %s flush_gen=%llu\n",
- dev_state->name,
+ pr_info("bio_end_io() new %pg flush_gen=%llu\n",
+ dev_state->bdev,
dev_state->last_flush_gen);
}
if (block->submit_bio_bh_rw & REQ_FUA)
@@ -2078,17 +2089,19 @@ static int btrfsic_process_written_superblock(
if (!(superblock->generation > state->max_superblock_generation ||
0 == state->max_superblock_generation)) {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n",
+ pr_info(
+ "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
} else {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n",
+ pr_info(
+ "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
@@ -2232,38 +2245,42 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
*/
list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
if (l->block_ref_to->never_written) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (!l->block_ref_to->is_iodone) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (l->block_ref_to->iodone_w_error) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
@@ -2273,10 +2290,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
l->parent_generation &&
BTRFSIC_GENERATION_UNKNOWN !=
l->block_ref_to->generation) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num,
l->block_ref_to->generation,
@@ -2284,10 +2302,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
ret = -1;
} else if (l->block_ref_to->flush_gen >
l->block_ref_to->dev_state->last_flush_gen) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num, block->flush_gen,
l->block_ref_to->dev_state->last_flush_gen);
@@ -2324,15 +2343,16 @@ static int btrfsic_is_block_ref_by_superblock(
*/
list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
if (l->block_ref_from->is_superblock &&
@@ -2354,30 +2374,30 @@ static int btrfsic_is_block_ref_by_superblock(
static void btrfsic_print_add_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
static void btrfsic_print_rem_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
@@ -2419,9 +2439,9 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
+ indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
printk("[...]\n");
@@ -2542,10 +2562,10 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
block->never_written = never_written;
block->mirror_num = mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
additional_string,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2592,8 +2612,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
}
if (WARN_ON(!match)) {
- pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
+ bytenr, dev_state->bdev, dev_bytenr);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
ret = btrfsic_map_block(state, bytenr,
state->metablock_size,
@@ -2601,8 +2622,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
if (ret)
continue;
- pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n",
- bytenr, block_ctx.dev->name,
+ pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
+ bytenr, block_ctx.dev->bdev,
block_ctx.dev_bytenr, mirror_num);
}
}
@@ -2675,8 +2696,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->name);
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
} else {
struct btrfsic_block *const block =
&dev_state->dummy_block_for_bio_bh_flush;
@@ -2751,7 +2773,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
list_for_each_entry(device, dev_head, dev_list) {
struct btrfsic_dev_state *ds;
- const char *p;
if (!device->bdev || !device->name)
continue;
@@ -2763,10 +2784,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
}
ds->bdev = device->bdev;
ds->state = state;
- bdevname(ds->bdev, ds->name);
- ds->name[BDEVNAME_SIZE - 1] = '\0';
- p = kbasename(ds->name);
- strlcpy(ds->name, p, sizeof(ds->name));
btrfsic_dev_state_hashtable_add(ds,
&btrfsic_dev_state_hashtable);
}
@@ -2844,9 +2861,10 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
if (b_all->is_iodone || b_all->never_written)
btrfsic_block_free(b_all);
else
- pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0913ee50e6c3..19bf36d8ffea 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
@@ -28,6 +29,7 @@
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
+#include "subpage.h"
#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -94,10 +96,10 @@ static int compression_compress_pages(int type, struct list_head *ws,
}
}
-static int compression_decompress_bio(int type, struct list_head *ws,
- struct compressed_bio *cb)
+static int compression_decompress_bio(struct list_head *ws,
+ struct compressed_bio *cb)
{
- switch (type) {
+ switch (cb->compress_type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
@@ -155,7 +157,8 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
- if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
+ if ((inode->flags & BTRFS_INODE_NODATASUM) ||
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
return 0;
shash->tfm = fs_info->csum_shash;
@@ -180,9 +183,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
- if (btrfs_io_bio(bio)->device)
+ if (btrfs_bio(bio)->device)
btrfs_dev_stat_inc_and_print(
- btrfs_io_bio(bio)->device,
+ btrfs_bio(bio)->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
return -EIO;
}
@@ -193,6 +196,86 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
return 0;
}
+/*
+ * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
+ *
+ * Return true if there is no pending bio nor io.
+ * Return false otherwise.
+ */
+static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ unsigned int bi_size = 0;
+ bool last_io = false;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ /*
+ * At endio time, bi_iter.bi_size doesn't represent the real bio size.
+ * Thus here we have to iterate through all segments to grab correct
+ * bio size.
+ */
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bi_size += bvec->bv_len;
+
+ if (bio->bi_status)
+ cb->status = bio->bi_status;
+
+ ASSERT(bi_size && bi_size <= cb->compressed_len);
+ last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
+ &cb->pending_sectors);
+ /*
+ * Here we must wake up the possible error handler after all other
+ * operations on @cb finished, or we can race with
+ * finish_compressed_bio_*() which may free @cb.
+ */
+ wake_up_var(cb);
+
+ return last_io;
+}
+
+static void finish_compressed_bio_read(struct compressed_bio *cb)
+{
+ unsigned int index;
+ struct page *page;
+
+ /* Release the compressed pages */
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ put_page(page);
+ }
+
+ /* Do io completion on the original bio */
+ if (cb->status != BLK_STS_OK) {
+ cb->orig_bio->bi_status = cb->status;
+ bio_endio(cb->orig_bio);
+ } else {
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ /*
+ * We have verified the checksum already, set page checked so
+ * the end_io handlers know about it
+ */
+ ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
+ u64 bvec_start = page_offset(bvec->bv_page) +
+ bvec->bv_offset;
+
+ btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
+ bvec->bv_page, bvec_start,
+ bvec->bv_len);
+ }
+
+ bio_endio(cb->orig_bio);
+ }
+
+ /* Finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+}
+
/* when we finish reading compressed pages from the disk, we
* decompress them and then run the bio end_io routines on the
* decompressed pages (in the inode address space).
@@ -207,32 +290,24 @@ static void end_compressed_bio_read(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
- struct page *page;
- unsigned int index;
- unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
+ unsigned int mirror = btrfs_bio(bio)->mirror_num;
int ret = 0;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
+ if (!dec_and_test_compressed_bio(cb, bio))
goto out;
/*
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
- btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
+ btrfs_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
/*
* Some IO in this cb have failed, just skip checksum as there
* is no way it could be correct.
*/
- if (cb->errors == 1)
+ if (cb->status != BLK_STS_OK)
goto csum_failed;
inode = cb->inode;
@@ -248,37 +323,8 @@ static void end_compressed_bio_read(struct bio *bio)
csum_failed:
if (ret)
- cb->errors = 1;
-
- /* release the compressed pages */
- index = 0;
- for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
- page->mapping = NULL;
- put_page(page);
- }
-
- /* do io completion on the original bio */
- if (cb->errors) {
- bio_io_error(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * we have verified the checksum already, set page
- * checked so the end_io handlers know about it
- */
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
- SetPageChecked(bvec->bv_page);
-
- bio_endio(cb->orig_bio);
- }
-
- /* finally free the cb struct */
- kfree(cb->compressed_pages);
- kfree(cb);
+ cb->status = errno_to_blk_status(ret);
+ finish_compressed_bio_read(cb);
out:
bio_put(bio);
}
@@ -290,15 +336,17 @@ out:
static noinline void end_compressed_writeback(struct inode *inode,
const struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
+ const int errno = blk_status_to_errno(cb->status);
int i;
int ret;
- if (cb->errors)
- mapping_set_error(inode->i_mapping, -EIO);
+ if (errno)
+ mapping_set_error(inode->i_mapping, errno);
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
@@ -310,9 +358,10 @@ static noinline void end_compressed_writeback(struct inode *inode,
continue;
}
for (i = 0; i < ret; i++) {
- if (cb->errors)
+ if (errno)
SetPageError(pages[i]);
- end_page_writeback(pages[i]);
+ btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+ cb->start, cb->len);
put_page(pages[i]);
}
nr_pages -= ret;
@@ -321,60 +370,128 @@ static noinline void end_compressed_writeback(struct inode *inode,
/* the inode may be gone now */
}
-/*
- * do the cleanup once all the compressed pages hit the disk.
- * This will clear writeback on the file pages and free the compressed
- * pages.
- *
- * This also calls the writeback end hooks for the file pages so that
- * metadata and checksums can be updated in the file.
- */
-static void end_compressed_bio_write(struct bio *bio)
+static void finish_compressed_bio_write(struct compressed_bio *cb)
{
- struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- struct page *page;
+ struct inode *inode = cb->inode;
unsigned int index;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
- goto out;
-
- /* ok, we're the last bio for this extent, step one is to
- * call back into the FS and do all the end_io operations
+ /*
+ * Ok, we're the last bio for this extent, step one is to call back
+ * into the FS and do all the end_io operations.
*/
- inode = cb->inode;
- btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
- !cb->errors);
+ cb->status == BLK_STS_OK);
- end_compressed_writeback(inode, cb);
- /* note, our inode could be gone now */
+ if (cb->writeback)
+ end_compressed_writeback(inode, cb);
+ /* Note, our inode could be gone now */
/*
- * release the compressed pages, these came from alloc_page and
+ * Release the compressed pages, these came from alloc_page and
* are not attached to the inode at all
*/
- index = 0;
for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
+ struct page *page = cb->compressed_pages[index];
+
page->mapping = NULL;
put_page(page);
}
- /* finally free the cb struct */
+ /* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
+}
+
+/*
+ * Do the cleanup once all the compressed pages hit the disk. This will clear
+ * writeback on the file pages and free the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that metadata
+ * and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio)
+{
+ struct compressed_bio *cb = bio->bi_private;
+
+ if (!dec_and_test_compressed_bio(cb, bio))
+ goto out;
+
+ btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+
+ finish_compressed_bio_write(cb);
out:
bio_put(bio);
}
+static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
+ struct compressed_bio *cb,
+ struct bio *bio, int mirror_num)
+{
+ blk_status_t ret;
+
+ ASSERT(bio->bi_iter.bi_size);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ if (ret)
+ return ret;
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ return ret;
+}
+
+/*
+ * Allocate a bio for a compressed_bio, which will be used to read/write
+ * on-disk (aka, compressed) data.
+ *
+ * @cb: The compressed_bio structure, which records all the needed
+ * information to bind the compressed data to the uncompressed
+ * page cache.
+ * @disk_bytenr: The logical bytenr where the compressed data will be read
+ * from or written to.
+ * @endio_func: The endio function to call after the IO for compressed data
+ * is finished.
+ * @next_stripe_start: Return value of logical bytenr of where next stripe starts.
+ * Let the caller know to only fill the bio up to the stripe
+ * boundary.
+ */
+
+
+static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+ unsigned int opf, bio_end_io_t endio_func,
+ u64 *next_stripe_start)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ struct btrfs_io_geometry geom;
+ struct extent_map *em;
+ struct bio *bio;
+ int ret;
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
+
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bio->bi_opf = opf;
+ bio->bi_private = cb;
+ bio->bi_end_io = endio_func;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+ if (IS_ERR(em)) {
+ bio_put(bio);
+ return ERR_CAST(em);
+ }
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ bio_put(bio);
+ return ERR_PTR(ret);
+ }
+ *next_stripe_start = disk_bytenr + geom.len;
+
+ return bio;
+}
+
/*
* worker function to build and submit bios for previously compressed pages.
* The corresponding pages in the inode should be marked for writeback
@@ -390,147 +507,138 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned int nr_pages,
unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css)
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
- unsigned long bytes_left;
- int pg_index = 0;
- struct page *page;
- u64 first_byte = disk_start;
+ u64 cur_disk_bytenr = disk_start;
+ u64 next_stripe_start;
blk_status_t ret;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
const bool use_append = btrfs_use_zone_append(inode, disk_start);
const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
- WARN_ON(!PAGE_ALIGNED(start));
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_bios, 0);
- cb->errors = 0;
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+ cb->status = BLK_STS_OK;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
+ cb->writeback = writeback;
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
-
- if (use_append) {
- struct btrfs_device *device;
-
- device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
- if (IS_ERR(device)) {
- kfree(cb);
- bio_put(bio);
- return BLK_STS_NOTSUPP;
- }
-
- bio_set_dev(bio, device->bdev);
- }
-
- if (blkcg_css) {
- bio->bi_opf |= REQ_CGROUP_PUNT;
+ if (blkcg_css)
kthread_associate_blkcg(blkcg_css);
- }
- refcount_set(&cb->pending_bios, 1);
-
- /* create and submit bios for the compressed pages */
- bytes_left = compressed_len;
- for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
- int submit = 0;
- int len = 0;
-
- page = compressed_pages[pg_index];
- page->mapping = inode->vfs_inode.i_mapping;
- if (bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
- 0);
+ while (cur_disk_bytenr < disk_start + compressed_len) {
+ u64 offset = cur_disk_bytenr - disk_start;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!bio) {
+ bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+ bio_op | write_flags, end_compressed_bio_write,
+ &next_stripe_start);
+ if (IS_ERR(bio)) {
+ ret = errno_to_blk_status(PTR_ERR(bio));
+ bio = NULL;
+ goto finish_cb;
+ }
+ if (blkcg_css)
+ bio->bi_opf |= REQ_CGROUP_PUNT;
+ }
/*
- * Page can only be added to bio if the current bio fits in
- * stripe.
+ * We should never already be at next_stripe_start here, as the bio is
+ * submitted immediately when the stripe boundary is reached.
*/
- if (!submit) {
- if (pg_index == 0 && use_append)
- len = bio_add_zone_append_page(bio, page,
- PAGE_SIZE, 0);
- else
- len = bio_add_page(bio, page, PAGE_SIZE, 0);
- }
-
- page->mapping = NULL;
- if (submit || len < PAGE_SIZE) {
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
+ ASSERT(cur_disk_bytenr != next_stripe_start);
+ /*
+ * We have various limits on the real write size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+ if (use_append)
+ added = bio_add_zone_append_page(bio, page, real_size,
+ offset_in_page(offset));
+ else
+ added = bio_add_page(bio, page, real_size,
+ offset_in_page(offset));
+ /* Reached zoned boundary */
+ if (added == 0)
+ submit = true;
+
+ cur_disk_bytenr += added;
+ /* Reached stripe boundary */
+ if (cur_disk_bytenr == next_stripe_start)
+ submit = true;
+
+ /* Finished the range */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ submit = true;
+
+ if (submit) {
if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_csum_one_bio(inode, bio, start, true);
+ if (ret)
+ goto finish_cb;
}
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
- }
-
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
- if (blkcg_css)
- bio->bi_opf |= REQ_CGROUP_PUNT;
- /*
- * Use bio_add_page() to ensure the bio has at least one
- * page.
- */
- bio_add_page(bio, page, PAGE_SIZE, 0);
- }
- if (bytes_left < PAGE_SIZE) {
- btrfs_info(fs_info,
- "bytes left %lu compress len %u nr %u",
- bytes_left, cb->compressed_len, cb->nr_pages);
+ ret = submit_compressed_bio(fs_info, cb, bio, 0);
+ if (ret)
+ goto finish_cb;
+ bio = NULL;
}
- bytes_left -= PAGE_SIZE;
- first_byte += PAGE_SIZE;
cond_resched();
}
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
+ return 0;
- if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
+finish_cb:
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
+ if (bio) {
bio->bi_status = ret;
bio_endio(bio);
}
+ /* Last byte of @cb is submitted, endio will free @cb */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ return ret;
- if (blkcg_css)
- kthread_associate_blkcg(NULL);
-
- return 0;
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_start + compressed_len - cur_disk_bytenr) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even though the previously submitted bios have ended, there is still io
+ * that was never submitted, thus we need to finish @cb manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring @cb, can finish it safely. */
+ finish_compressed_bio_write(cb);
+ return ret;
}
static u64 bio_end_offset(struct bio *bio)
@@ -540,25 +648,33 @@ static u64 bio_end_offset(struct bio *bio)
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
+/*
+ * Add extra pages in the same compressed file extent so that we don't need to
+ * re-read the same extent again and again.
+ *
+ * NOTE: this won't work well for subpage, as for subpage read, we lock the
+ * full page then submit a bio for each compressed/regular extent.
+ *
+ * This means, if we have several sectors in the same page pointing to the same
+ * on-disk compressed data, we will re-read the same extent many times and
+ * this function can only help for the next page.
+ */
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
- unsigned long pg_index;
- u64 last_offset;
+ u64 cur = bio_end_offset(cb->orig_bio);
u64 isize = i_size_read(inode);
int ret;
struct page *page;
- unsigned long nr_pages = 0;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
struct extent_io_tree *tree;
- u64 end;
- int misses = 0;
+ int sectors_missed = 0;
- last_offset = bio_end_offset(cb->orig_bio);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
@@ -577,18 +693,29 @@ static noinline int add_ra_bio_pages(struct inode *inode,
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- while (last_offset < compressed_end) {
- pg_index = last_offset >> PAGE_SHIFT;
+ while (cur < compressed_end) {
+ u64 page_end;
+ u64 pg_index = cur >> PAGE_SHIFT;
+ u32 add_size;
if (pg_index > end_index)
break;
page = xa_load(&mapping->i_pages, pg_index);
if (page && !xa_is_value(page)) {
- misses++;
- if (misses > 4)
+ sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+ fs_info->sectorsize_bits;
+
+ /* Beyond threshold, no need to continue */
+ if (sectors_missed > 4)
break;
- goto next;
+
+ /*
+ * Jump to next page start as we already have page for
+ * current offset.
+ */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
page = __page_cache_alloc(mapping_gfp_constraint(mapping,
@@ -598,14 +725,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
put_page(page);
- goto next;
+ /* There is already a page, skip to page end */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
- /*
- * at this point, we have a locked page in the page cache
- * for these bytes in the file. But, we have to make
- * sure they map to this compressed extent on disk.
- */
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
@@ -613,18 +737,22 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
}
- end = last_offset + PAGE_SIZE - 1;
- lock_extent(tree, last_offset, end);
+ page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+ lock_extent(tree, cur, page_end);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, last_offset,
- PAGE_SIZE);
+ em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
- if (!em || last_offset < em->start ||
- (last_offset + PAGE_SIZE > extent_map_end(em)) ||
+ /*
+ * At this point, we have a locked page in the page cache for
+ * these bytes in the file. But, we have to make sure they map
+ * to this compressed extent on disk.
+ */
+ if (!em || cur < em->start ||
+ (cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end);
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
@@ -642,20 +770,23 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
}
- ret = bio_add_page(cb->orig_bio, page,
- PAGE_SIZE, 0);
-
- if (ret == PAGE_SIZE) {
- nr_pages++;
- put_page(page);
- } else {
- unlock_extent(tree, last_offset, end);
+ add_size = min(em->start + em->len, page_end + 1) - cur;
+ ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+ if (ret != add_size) {
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
}
-next:
- last_offset += PAGE_SIZE;
+ /*
+ * If it's subpage, we also need to increase its
+ * subpage::readers number, as at endio we will decrease
+ * subpage::readers and unlock the page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ put_page(page);
+ cur += add_size;
}
return 0;
}
@@ -680,14 +811,15 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned int compressed_len;
unsigned int nr_pages;
unsigned int pg_index;
- struct page *page;
- struct bio *comp_bio;
- u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+ struct bio *comp_bio = NULL;
+ const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_byte = disk_bytenr;
+ u64 next_stripe_start;
u64 file_offset;
u64 em_len;
u64 em_start;
struct extent_map *em;
- blk_status_t ret = BLK_STS_RESOURCE;
+ blk_status_t ret;
int faili = 0;
u8 *sums;
@@ -700,17 +832,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
- if (!em)
- return BLK_STS_IOERR;
+ if (!em) {
+ ret = BLK_STS_IOERR;
+ goto out;
+ }
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
- if (!cb)
+ if (!cb) {
+ ret = BLK_STS_RESOURCE;
goto out;
+ }
- refcount_set(&cb->pending_bios, 0);
- cb->errors = 0;
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+ cb->status = BLK_STS_OK;
cb->inode = inode;
cb->mirror_num = mirror_num;
sums = cb->sums;
@@ -730,8 +866,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
GFP_NOFS);
- if (!cb->compressed_pages)
+ if (!cb->compressed_pages) {
+ ret = BLK_STS_RESOURCE;
goto fail1;
+ }
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
@@ -749,87 +887,75 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
- refcount_set(&cb->pending_bios, 1);
-
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- u32 pg_len = PAGE_SIZE;
- int submit = 0;
+ while (cur_disk_byte < disk_bytenr + compressed_len) {
+ u64 offset = cur_disk_byte - disk_bytenr;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = cb->compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!comp_bio) {
+ comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+ REQ_OP_READ, end_compressed_bio_read,
+ &next_stripe_start);
+ if (IS_ERR(comp_bio)) {
+ ret = errno_to_blk_status(PTR_ERR(comp_bio));
+ comp_bio = NULL;
+ goto finish_cb;
+ }
+ }
+ /*
+ * We should never reach next_stripe_start as we will
+ * submit comp_bio immediately when reaching the boundary.
+ */
+ ASSERT(cur_disk_byte != next_stripe_start);
+ /*
+ * We have various limits on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+ added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
/*
- * To handle subpage case, we need to make sure the bio only
- * covers the range we need.
- *
- * If we're at the last page, truncate the length to only cover
- * the remaining part.
+ * The maximum compressed extent is smaller than the bio size limit,
+ * thus bio_add_page() should always succeed.
*/
- if (pg_index == nr_pages - 1)
- pg_len = min_t(u32, PAGE_SIZE,
- compressed_len - pg_index * PAGE_SIZE);
+ ASSERT(added == real_size);
+ cur_disk_byte += added;
- page = cb->compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
- page->index = em_start >> PAGE_SHIFT;
+ /* Reached stripe boundary, need to submit */
+ if (cur_disk_byte == next_stripe_start)
+ submit = true;
- if (comp_bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, pg_len,
- comp_bio, 0);
+ /* Has finished the range, need to submit */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ submit = true;
- page->mapping = NULL;
- if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
+ if (submit) {
unsigned int nr_sectors;
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
-
ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ goto finish_cb;
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
-
- bio_add_page(comp_bio, page, pg_len, 0);
+ ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+ if (ret)
+ goto finish_cb;
+ comp_bio = NULL;
}
- cur_disk_byte += pg_len;
- }
-
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
}
-
- return 0;
+ return BLK_STS_OK;
fail2:
while (faili >= 0) {
@@ -842,6 +968,28 @@ fail1:
kfree(cb);
out:
free_extent_map(em);
+ bio->bi_status = ret;
+ bio_endio(bio);
+ return ret;
+finish_cb:
+ if (comp_bio) {
+ comp_bio->bi_status = ret;
+ bio_endio(comp_bio);
+ }
+ /* All bytes of @cb are submitted, endio will free @cb */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ return ret;
+
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_bytenr + compressed_len - cur_disk_byte) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even with previous bio ended, we should still have io not yet
+ * submitted, thus need to finish @cb manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring to @cb, can finish it safely. */
+ finish_compressed_bio_read(cb);
return ret;
}
@@ -1231,7 +1379,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int type = cb->compress_type;
workspace = get_workspace(type, 0);
- ret = compression_decompress_bio(type, workspace, cb);
+ ret = compression_decompress_bio(workspace, cb);
put_workspace(type, workspace);
return ret;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 399be0b435bf..ac5b20731d2a 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -22,14 +22,16 @@ struct btrfs_inode;
/* Maximum length of compressed data stored on disk */
#define BTRFS_MAX_COMPRESSED (SZ_128K)
+static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
+
/* Maximum size of data before compression */
#define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* number of bios pending for this compressed extent */
- refcount_t pending_bios;
+ /* Number of sectors with unfinished IO (unsubmitted or unfinished) */
+ refcount_t pending_sectors;
/* Number of compressed pages in the array */
unsigned int nr_pages;
@@ -52,8 +54,11 @@ struct compressed_bio {
/* The compression algorithm for this bio */
u8 compress_type;
+ /* Whether this is a write for writeback. */
+ bool writeback;
+
/* IO errors */
- u8 errors;
+ blk_status_t status;
int mirror_num;
/* for reads, this is the bio we are copying the data into */
@@ -95,7 +100,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned int nr_pages,
unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css);
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84627cbd5b5b..0eecf98d0abb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -395,7 +396,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (*cow_ret == buf)
unlock_orig = 1;
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
@@ -462,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -484,8 +485,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
}
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -725,21 +726,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
}
/*
- * search for key in the extent_buffer. The items start at offset p,
- * and they are item_size apart.
+ * Search for a key in the given extent_buffer.
*
- * the slot in the array is returned via slot, and it points to
- * the place where you would insert key if it is not found in
- * the array.
+ * The lower boundary for the search is specified by the slot number @low. Use a
+ * value of 0 to search over the whole extent buffer.
*
- * Slot may point to total number of items if the key is bigger than
- * all of the keys
+ * The slot in the extent buffer is returned via @slot. If the key exists in the
+ * extent buffer, then @slot will point to the slot where the key is, otherwise
+ * it points to the slot where you would insert the key.
+ *
+ * Slot may point to the total number of items (i.e. one position beyond the last
+ * key) if the key is bigger than the last key in the extent buffer.
*/
-static noinline int generic_bin_search(struct extent_buffer *eb,
- unsigned long p, int item_size,
+static noinline int generic_bin_search(struct extent_buffer *eb, int low,
const struct btrfs_key *key, int *slot)
{
- int low = 0;
+ unsigned long p;
+ int item_size;
int high = btrfs_header_nritems(eb);
int ret;
const int key_size = sizeof(struct btrfs_disk_key);
@@ -752,6 +755,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
return -EINVAL;
}
+ if (btrfs_header_level(eb) == 0) {
+ p = offsetof(struct btrfs_leaf, items);
+ item_size = sizeof(struct btrfs_item);
+ } else {
+ p = offsetof(struct btrfs_node, ptrs);
+ item_size = sizeof(struct btrfs_key_ptr);
+ }
+
while (low < high) {
unsigned long oip;
unsigned long offset;
@@ -790,20 +801,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
}
/*
- * simple bin_search frontend that does the right thing for
- * leaves vs nodes
+ * Simple binary search on an extent buffer. Works for both leaves and nodes, and
+ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
*/
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
int *slot)
{
- if (btrfs_header_level(eb) == 0)
- return generic_bin_search(eb,
- offsetof(struct btrfs_leaf, items),
- sizeof(struct btrfs_item), key, slot);
- else
- return generic_bin_search(eb,
- offsetof(struct btrfs_node, ptrs),
- sizeof(struct btrfs_key_ptr), key, slot);
+ return generic_bin_search(eb, 0, key, slot);
}
static void root_add_used(struct btrfs_root *root, u32 size)
@@ -842,9 +846,11 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
- if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb))
+ return eb;
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
- eb = ERR_PTR(-EIO);
+ return ERR_PTR(-EIO);
}
return eb;
@@ -926,7 +932,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
return 0;
@@ -985,7 +991,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
- btrfs_free_tree_block(trans, root, right, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), right,
+ 0, 1);
free_extent_buffer_stale(right);
right = NULL;
} else {
@@ -1030,7 +1037,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
} else {
@@ -1344,33 +1351,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
{
int i;
int skip_level = level;
- int no_skips = 0;
- struct extent_buffer *t;
+ bool check_skip = true;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
if (!path->nodes[i])
break;
if (!path->locks[i])
break;
- if (!no_skips && path->slots[i] == 0) {
- skip_level = i + 1;
- continue;
- }
- if (!no_skips && path->keep_locks) {
- u32 nritems;
- t = path->nodes[i];
- nritems = btrfs_header_nritems(t);
- if (nritems < 1 || path->slots[i] >= nritems - 1) {
+
+ if (check_skip) {
+ if (path->slots[i] == 0) {
skip_level = i + 1;
continue;
}
+
+ if (path->keep_locks) {
+ u32 nritems;
+
+ nritems = btrfs_header_nritems(path->nodes[i]);
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
+ skip_level = i + 1;
+ continue;
+ }
+ }
}
- if (skip_level < i && i >= lowest_unlock)
- no_skips = 1;
- t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level) {
- btrfs_tree_unlock_rw(t, path->locks[i]);
+ check_skip = false;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
path->locks[i] = 0;
if (write_lock_level &&
i > min_write_lock_level &&
@@ -1430,13 +1438,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
/* now we're allowed to do a blocking uptodate check */
ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
- if (!ret) {
- *eb_ret = tmp;
- return 0;
+ if (ret) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EIO;
+ *eb_ret = tmp;
+ return 0;
}
/*
@@ -1454,19 +1462,19 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
ret = -EAGAIN;
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
- if (!IS_ERR(tmp)) {
- /*
- * If the read above didn't mark this buffer up to date,
- * it will never end up being up to date. Set ret to EIO now
- * and give up so that our caller doesn't loop forever
- * on our EAGAINs.
- */
- if (!extent_buffer_uptodate(tmp))
- ret = -EIO;
- free_extent_buffer(tmp);
- } else {
- ret = PTR_ERR(tmp);
+ if (IS_ERR(tmp)) {
+ btrfs_release_path(p);
+ return PTR_ERR(tmp);
}
+ /*
+ * If the read above didn't mark this buffer up to date,
+ * it will never end up being up to date. Set ret to EIO now
+ * and give up so that our caller doesn't loop forever
+ * on our EAGAINs.
+ */
+ if (!extent_buffer_uptodate(tmp))
+ ret = -EIO;
+ free_extent_buffer(tmp);
btrfs_release_path(p);
return ret;
@@ -1566,35 +1574,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
struct btrfs_path *p,
int write_lock_level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
- /*
- * The commit roots are read only so we always do read locks,
- * and we always must hold the commit_root_sem when doing
- * searches on them, the only exception is send where we don't
- * want to block transaction commits for a long time, so
- * we need to clone the commit root in order to avoid races
- * with transaction commits that create a snapshot of one of
- * the roots used by a send operation.
- */
- if (p->need_commit_sem) {
- down_read(&fs_info->commit_root_sem);
- b = btrfs_clone_extent_buffer(root->commit_root);
- up_read(&fs_info->commit_root_sem);
- if (!b)
- return ERR_PTR(-ENOMEM);
-
- } else {
- b = root->commit_root;
- atomic_inc(&b->refs);
- }
+ b = root->commit_root;
+ atomic_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
@@ -1611,6 +1597,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -1637,6 +1626,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
@@ -1646,6 +1646,191 @@ out:
return b;
}
+/*
+ * Replace the extent buffer at the lowest level of the path with a cloned
+ * version. The purpose is to be able to use it safely, after releasing the
+ * commit root semaphore, even if relocation is happening in parallel, the
+ * transaction used for relocation is committed and the extent buffer is
+ * reallocated in the next transaction.
+ *
+ * This is used in a context where the caller does not prevent transaction
+ * commits from happening, either by holding a transaction handle or holding
+ * some lock, while it's doing searches through a commit root.
+ * At the moment it's only used for send operations.
+ */
+static int finish_need_commit_sem_search(struct btrfs_path *path)
+{
+ const int i = path->lowest_level;
+ const int slot = path->slots[i];
+ struct extent_buffer *lowest = path->nodes[i];
+ struct extent_buffer *clone;
+
+ ASSERT(path->need_commit_sem);
+
+ if (!lowest)
+ return 0;
+
+ lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
+
+ clone = btrfs_clone_extent_buffer(lowest);
+ if (!clone)
+ return -ENOMEM;
+
+ btrfs_release_path(path);
+ path->nodes[i] = clone;
+ path->slots[i] = slot;
+
+ return 0;
+}
+
+static inline int search_for_key_slot(struct extent_buffer *eb,
+ int search_low_slot,
+ const struct btrfs_key *key,
+ int prev_cmp,
+ int *slot)
+{
+ /*
+ * If a previous call to btrfs_bin_search() on a parent node returned an
+ * exact match (prev_cmp == 0), we can safely assume the target key will
+ * always be at slot 0 on lower levels, since each key pointer
+ * (struct btrfs_key_ptr) refers to the lowest key accessible from the
+ * subtree it points to. Thus we can skip searching lower levels.
+ */
+ if (prev_cmp == 0) {
+ *slot = 0;
+ return 0;
+ }
+
+ return generic_bin_search(eb, search_low_slot, key, slot);
+}
+
+static int search_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_path *path,
+ int ins_len,
+ int prev_cmp)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ int leaf_free_space = -1;
+ int search_low_slot = 0;
+ int ret;
+ bool do_bin_search = true;
+
+ /*
+ * If we are doing an insertion, the leaf has enough free space and the
+ * destination slot for the key is not slot 0, then we can unlock our
+ * write lock on the parent, and any other upper nodes, before doing the
+ * binary search on the leaf (with search_for_key_slot()), allowing other
+ * tasks to lock the parent and any other upper nodes.
+ */
+ if (ins_len > 0) {
+ /*
+ * Cache the leaf free space, since we will need it later and it
+ * will not change until then.
+ */
+ leaf_free_space = btrfs_leaf_free_space(leaf);
+
+ /*
+ * !path->locks[1] means we have a single node tree, the leaf is
+ * the root of the tree.
+ */
+ if (path->locks[1] && leaf_free_space >= ins_len) {
+ struct btrfs_disk_key first_key;
+
+ ASSERT(btrfs_header_nritems(leaf) > 0);
+ btrfs_item_key(leaf, &first_key, 0);
+
+ /*
+ * Doing the extra comparison with the first key is cheap,
+ * taking into account that the first key is very likely
+ * already in a cache line because it immediately follows
+ * the extent buffer's header and we have recently accessed
+ * the header's level field.
+ */
+ ret = comp_keys(&first_key, key);
+ if (ret < 0) {
+ /*
+ * The first key is smaller than the key we want
+ * to insert, so we are safe to unlock all upper
+ * nodes and we have to do the binary search.
+ *
+ * We do use btrfs_unlock_up_safe() and not
+ * unlock_up() because the latter does not unlock
+ * nodes with a slot of 0 - we can safely unlock
+ * any node even if its slot is 0 since in this
+ * case the key does not end up at slot 0 of the
+ * leaf and there's no need to split the leaf.
+ */
+ btrfs_unlock_up_safe(path, 1);
+ search_low_slot = 1;
+ } else {
+ /*
+ * The first key is >= than the key we want to
+ * insert, so we can skip the binary search as
+ * the target key will be at slot 0.
+ *
+ * We can not unlock upper nodes when the key is
+ * less than the first key, because we will need
+ * to update the key at slot 0 of the parent node
+ * and possibly of other upper nodes too.
+ * If the key matches the first key, then we can
+ * unlock all the upper nodes, using
+ * btrfs_unlock_up_safe() instead of unlock_up()
+ * as stated above.
+ */
+ if (ret == 0)
+ btrfs_unlock_up_safe(path, 1);
+ /*
+ * ret is already 0 or 1, matching the result of
+ * a btrfs_bin_search() call, so there is no need
+ * to adjust it.
+ */
+ do_bin_search = false;
+ path->slots[0] = 0;
+ }
+ }
+ }
+
+ if (do_bin_search) {
+ ret = search_for_key_slot(leaf, search_low_slot, key,
+ prev_cmp, &path->slots[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (ins_len > 0) {
+ /*
+ * Item key already exists. In this case, if we are allowed to
+ * insert the item (for example, in dir_item case, item key
+ * collision is allowed), it will be merged with the original
+ * item. Only the item size grows, no new btrfs item will be
+ * added. If search_for_extension is not set, ins_len already
+ * accounts for the size of btrfs_item, deduct it here so leaf space
+ * check will be correct.
+ */
+ if (ret == 0 && !path->search_for_extension) {
+ ASSERT(ins_len >= sizeof(struct btrfs_item));
+ ins_len -= sizeof(struct btrfs_item);
+ }
+
+ ASSERT(leaf_free_space >= 0);
+
+ if (leaf_free_space < ins_len) {
+ int err;
+
+ err = split_leaf(trans, root, key, path, ins_len,
+ (ret == 0));
+ ASSERT(err <= 0);
+ if (WARN_ON(err > 0))
+ err = -EUCLEAN;
+ if (err)
+ ret = err;
+ }
+ }
+
+ return ret;
+}
/*
* btrfs_search_slot - look for a key in a tree and perform necessary
@@ -1682,6 +1867,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
@@ -1723,6 +1909,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
min_write_lock_level = write_lock_level;
+ if (p->need_commit_sem) {
+ ASSERT(p->search_commit_root);
+ down_read(&fs_info->commit_root_sem);
+ }
+
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
@@ -1776,10 +1967,6 @@ again:
}
cow_done:
p->nodes[level] = b;
- /*
- * Leave path with blocking locks to avoid massive
- * lock context switch, this is made on purpose.
- */
/*
* we have a lock on b and as long as we aren't changing
@@ -1801,62 +1988,22 @@ cow_done:
}
}
- /*
- * If btrfs_bin_search returns an exact match (prev_cmp == 0)
- * we can safely assume the target key will always be in slot 0
- * on lower levels due to the invariants BTRFS' btree provides,
- * namely that a btrfs_key_ptr entry always points to the
- * lowest key in the child node, thus we can skip searching
- * lower levels
- */
- if (prev_cmp == 0) {
- slot = 0;
- ret = 0;
- } else {
- ret = btrfs_bin_search(b, key, &slot);
- prev_cmp = ret;
- if (ret < 0)
- goto done;
- }
-
if (level == 0) {
- p->slots[level] = slot;
- /*
- * Item key already exists. In this case, if we are
- * allowed to insert the item (for example, in dir_item
- * case, item key collision is allowed), it will be
- * merged with the original item. Only the item size
- * grows, no new btrfs item will be added. If
- * search_for_extension is not set, ins_len already
- * accounts the size btrfs_item, deduct it here so leaf
- * space check will be correct.
- */
- if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
- ASSERT(ins_len >= sizeof(struct btrfs_item));
- ins_len -= sizeof(struct btrfs_item);
- }
- if (ins_len > 0 &&
- btrfs_leaf_free_space(b) < ins_len) {
- if (write_lock_level < 1) {
- write_lock_level = 1;
- btrfs_release_path(p);
- goto again;
- }
-
- err = split_leaf(trans, root, key,
- p, ins_len, ret == 0);
+ if (ins_len > 0)
+ ASSERT(write_lock_level >= 1);
- BUG_ON(err > 0);
- if (err) {
- ret = err;
- goto done;
- }
- }
+ ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
min_write_lock_level, NULL);
goto done;
}
+
+ ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
+ prev_cmp = ret;
+
if (ret && slot > 0) {
dec = 1;
slot--;
@@ -1917,6 +2064,16 @@ cow_done:
done:
if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
+
+ if (p->need_commit_sem) {
+ int ret2;
+
+ ret2 = finish_need_commit_sem_search(p);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
+
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
@@ -2487,7 +2644,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
int ret;
BUG_ON(!path->nodes[level]);
- btrfs_assert_tree_locked(path->nodes[level]);
+ btrfs_assert_tree_write_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
@@ -2614,19 +2771,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
- struct btrfs_item *start_item;
- struct btrfs_item *end_item;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- start_item = btrfs_item_nr(start);
- end_item = btrfs_item_nr(end);
- data_len = btrfs_item_offset(l, start_item) +
- btrfs_item_size(l, start_item);
- data_len = data_len - btrfs_item_offset(l, end_item);
+ data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
+ data_len = data_len - btrfs_item_offset(l, end);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -2673,7 +2825,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
u32 i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 nr;
u32 right_nritems;
u32 data_end;
@@ -2690,8 +2841,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] > i)
break;
@@ -2706,12 +2855,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(left, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(left, i);
+ if (this_item_size + sizeof(struct btrfs_item) +
+ push_space > free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
if (i == 0)
break;
i--;
@@ -2725,7 +2875,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
/* push left to right */
right_nritems = btrfs_header_nritems(right);
- push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+ push_space = btrfs_item_data_end(left, left_nritems - push_items);
push_space -= leaf_data_end(left);
/* make room in the right data area */
@@ -2756,9 +2906,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
- push_space -= btrfs_token_item_size(&token, item);
- btrfs_set_token_item_offset(&token, item, push_space);
+ push_space -= btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
left_nritems -= push_items;
@@ -2827,7 +2976,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (slot >= btrfs_header_nritems(upper) - 1)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
right = btrfs_read_node_slot(upper, slot + 1);
/*
@@ -2843,16 +2992,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (free_space < data_size)
goto out_unlock;
- /* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
- free_space = btrfs_leaf_free_space(right);
- if (free_space < data_size)
- goto out_unlock;
-
left_nritems = btrfs_header_nritems(left);
if (left_nritems == 0)
goto out_unlock;
@@ -2903,7 +3047,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
int i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 old_left_nritems;
u32 nr;
int ret = 0;
@@ -2917,8 +3060,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
nr = min(right_nritems - 1, max_slot);
for (i = 0; i < nr; i++) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] < i)
break;
@@ -2933,12 +3074,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(right, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(right, i);
+ if (this_item_size + sizeof(struct btrfs_item) + push_space >
+ free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
}
if (push_items == 0) {
@@ -2954,25 +3096,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
push_items * sizeof(struct btrfs_item));
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
- btrfs_item_offset_nr(right, push_items - 1);
+ btrfs_item_offset(right, push_items - 1);
copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(left) - push_space,
BTRFS_LEAF_DATA_OFFSET +
- btrfs_item_offset_nr(right, push_items - 1),
+ btrfs_item_offset(right, push_items - 1),
push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
btrfs_init_map_token(&token, left);
- old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+ old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
-
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item,
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -2983,7 +3123,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
right_nritems);
if (push_items < right_nritems) {
- push_space = btrfs_item_offset_nr(right, push_items - 1) -
+ push_space = btrfs_item_offset(right, push_items - 1) -
leaf_data_end(right);
memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
@@ -3001,10 +3141,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
-
- push_space = push_space - btrfs_token_item_size(&token, item);
- btrfs_set_token_item_offset(&token, item, push_space);
+ push_space = push_space - btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
btrfs_mark_buffer_dirty(left);
@@ -3065,7 +3203,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (right_nritems == 0)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
/*
@@ -3083,7 +3221,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- /* cow and double check */
ret = btrfs_cow_block(trans, root, left,
path->nodes[1], slot - 1, &left,
BTRFS_NESTING_LEFT_COW);
@@ -3094,12 +3231,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- free_space = btrfs_leaf_free_space(left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
-
if (check_sibling_keys(left, right)) {
ret = -EUCLEAN;
goto out;
@@ -3132,7 +3263,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
+ data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(mid),
@@ -3143,15 +3274,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
data_copy_size, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(l), data_copy_size);
- rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -3267,7 +3397,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
- if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ if (extend && data_size + btrfs_item_size(l, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
return -EOVERFLOW;
@@ -3436,7 +3566,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
if (btrfs_leaf_free_space(leaf) >= ins_len)
return 0;
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (key.type == BTRFS_EXTENT_DATA_KEY) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -3456,7 +3586,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
leaf = path->nodes[0];
/* if our item isn't there, return now */
- if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ if (item_size != btrfs_item_size(leaf, path->slots[0]))
goto err;
/* the leaf has changed, it now has room. return now */
@@ -3487,9 +3617,7 @@ static noinline int split_item(struct btrfs_path *path,
unsigned long split_offset)
{
struct extent_buffer *leaf;
- struct btrfs_item *item;
- struct btrfs_item *new_item;
- int slot;
+ int orig_slot, slot;
char *buf;
u32 nritems;
u32 item_size;
@@ -3499,9 +3627,9 @@ static noinline int split_item(struct btrfs_path *path,
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
- item = btrfs_item_nr(path->slots[0]);
- orig_offset = btrfs_item_offset(leaf, item);
- item_size = btrfs_item_size(leaf, item);
+ orig_slot = path->slots[0];
+ orig_offset = btrfs_item_offset(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
buf = kmalloc(item_size, GFP_NOFS);
if (!buf)
@@ -3522,14 +3650,12 @@ static noinline int split_item(struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(leaf, &disk_key, slot);
- new_item = btrfs_item_nr(slot);
+ btrfs_set_item_offset(leaf, slot, orig_offset);
+ btrfs_set_item_size(leaf, slot, item_size - split_offset);
- btrfs_set_item_offset(leaf, new_item, orig_offset);
- btrfs_set_item_size(leaf, new_item, item_size - split_offset);
-
- btrfs_set_item_offset(leaf, item,
- orig_offset + item_size - split_offset);
- btrfs_set_item_size(leaf, item, split_offset);
+ btrfs_set_item_offset(leaf, orig_slot,
+ orig_offset + item_size - split_offset);
+ btrfs_set_item_size(leaf, orig_slot, split_offset);
btrfs_set_header_nritems(leaf, nritems + 1);
@@ -3581,40 +3707,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
}
/*
- * This function duplicate a item, giving 'new_key' to the new item.
- * It guarantees both items live in the same tree leaf and the new item
- * is contiguous with the original item.
- *
- * This allows us to split file extent in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct btrfs_key *new_key)
-{
- struct extent_buffer *leaf;
- int ret;
- u32 item_size;
-
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ret = setup_leaf_for_split(trans, root, path,
- item_size + sizeof(struct btrfs_item));
- if (ret)
- return ret;
-
- path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size, 1);
- leaf = path->nodes[0];
- memcpy_extent_buffer(leaf,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
- item_size);
- return 0;
-}
-
-/*
* make the item pointed to by the path smaller. new_size indicates
* how small to make it, and from_end tells us if we just chop bytes
* off the end of the item or if we shift the item to chop bytes off
@@ -3624,7 +3716,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data_start;
@@ -3636,14 +3727,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
leaf = path->nodes[0];
slot = path->slots[0];
- old_size = btrfs_item_size_nr(leaf, slot);
+ old_size = btrfs_item_size(leaf, slot);
if (old_size == new_size)
return;
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
- old_data_start = btrfs_item_offset_nr(leaf, slot);
+ old_data_start = btrfs_item_offset(leaf, slot);
size_diff = old_size - new_size;
@@ -3657,10 +3748,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff + size_diff);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + size_diff);
}
/* shift the data */
@@ -3703,8 +3793,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
fixup_low_keys(path, &disk_key, 1);
}
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, new_size);
+ btrfs_set_item_size(leaf, slot, new_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3720,7 +3809,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data;
@@ -3738,7 +3826,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
BUG();
}
slot = path->slots[0];
- old_data = btrfs_item_end_nr(leaf, slot);
+ old_data = btrfs_item_data_end(leaf, slot);
BUG_ON(slot < 0);
if (slot >= nritems) {
@@ -3755,10 +3843,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff - data_size);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff - data_size);
}
/* shift the data */
@@ -3767,9 +3854,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
data_end, old_data - data_end);
data_end = old_data;
- old_size = btrfs_item_size_nr(leaf, slot);
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, old_size + data_size);
+ old_size = btrfs_item_size(leaf, slot);
+ btrfs_set_item_size(leaf, slot, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3785,16 +3871,12 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
*
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
- * @cpu_key: array of keys for items to be inserted
- * @data_size: size of the body of each item we are going to insert
- * @nr: size of @cpu_key/@data_size arrays
+ * @batch: information about the batch of items to insert
*/
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
@@ -3803,14 +3885,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
int slot;
struct btrfs_map_token token;
u32 total_size;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
- total_size = total_data + (nr * sizeof(struct btrfs_item));
+ /*
+ * Before anything else, update keys in the parent and other ancestors
+ * if needed, then release the write locks on them, so that other tasks
+ * can use them while we modify the leaf.
+ */
if (path->slots[0] == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
fixup_low_keys(path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -3820,6 +3902,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
if (btrfs_leaf_free_space(leaf) < total_size) {
btrfs_print_leaf(leaf);
@@ -3830,7 +3913,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+ unsigned int old_data = btrfs_item_data_end(leaf, slot);
if (old_data < data_end) {
btrfs_print_leaf(leaf);
@@ -3846,34 +3929,33 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item,
- ioff - total_data);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
+ ioff - batch->total_data_size);
}
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data - data_end);
+ data_end - batch->total_data_size,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ old_data - data_end);
data_end = old_data;
}
/* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ for (i = 0; i < batch->nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(slot + i);
- data_end -= data_size[i];
- btrfs_set_token_item_offset(&token, item, data_end);
- btrfs_set_token_item_size(&token, item, data_size[i]);
+ data_end -= batch->data_sizes[i];
+ btrfs_set_token_item_offset(&token, slot + i, data_end);
+ btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
}
- btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3883,26 +3965,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
/*
+ * Insert a new item into a leaf.
+ *
+ * @root: The root of the btree.
+ * @path: A path pointing to the target leaf and slot.
+ * @key: The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size)
+{
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ setup_items_for_insert(root, path, &batch);
+}
+
+/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+ const struct btrfs_item_batch *batch)
{
int ret = 0;
int slot;
- int i;
- u32 total_size = 0;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
+ u32 total_size;
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
if (ret == 0)
return -EEXIST;
if (ret < 0)
@@ -3911,7 +4010,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size, nr);
+ setup_items_for_insert(root, path, batch);
return 0;
}
@@ -3943,6 +4042,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
* delete the pointer from a given node.
*
* the tree should have been previously balanced so the deletion does not
@@ -4015,7 +4148,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
atomic_inc(&leaf->refs);
- btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
/*
@@ -4027,25 +4160,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
- struct btrfs_item *item;
- u32 last_off;
- u32 dsize = 0;
int ret = 0;
int wret;
- int i;
u32 nritems;
leaf = path->nodes[0];
- last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
-
- for (i = 0; i < nr; i++)
- dsize += btrfs_item_size_nr(leaf, slot + i);
-
nritems = btrfs_header_nritems(leaf);
if (slot + nr != nritems) {
- int data_end = leaf_data_end(leaf);
+ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
+ const int data_end = leaf_data_end(leaf);
struct btrfs_map_token token;
+ u32 dsize = 0;
+ int i;
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size(leaf, slot + i);
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
@@ -4056,9 +4186,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff + dsize);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + dsize);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -4086,24 +4215,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
fixup_low_keys(path, &disk_key, 1);
}
- /* delete the leaf if it is mostly empty */
+ /*
+ * Try to delete the leaf if it is mostly empty. We do this by
+ * trying to move all its items into its left and right neighbours.
+ * If we can't move all the items, then we don't delete it - it's
+ * not ideal, but future insertions might fill the leaf with more
+ * items, or items from other leaves might be moved later into our
+ * leaf due to deletions on those leaves.
+ */
if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
+ u32 min_push_space;
+
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
*/
slot = path->slots[1];
atomic_inc(&leaf->refs);
-
- wret = push_leaf_left(trans, root, path, 1, 1,
- 1, (u32)-1);
+ /*
+ * We want to be able to at least push one item to the
+ * left neighbour leaf, and that's the first item.
+ */
+ min_push_space = sizeof(struct btrfs_item) +
+ btrfs_item_size(leaf, 0);
+ wret = push_leaf_left(trans, root, path, 0,
+ min_push_space, 1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (path->nodes[0] == leaf &&
btrfs_header_nritems(leaf)) {
- wret = push_leaf_right(trans, root, path, 1,
- 1, 1, 0);
+ /*
+ * If we were not able to push all items from our
+ * leaf to its left neighbour, then attempt to
+ * either push all the remaining items to the
+ * right neighbour or none. There's no advantage
+ * in pushing only some items, instead of all, as
+ * it's pointless to end up with a leaf having
+ * too few items while the neighbours can be full
+ * or nearly full.
+ */
+ nritems = btrfs_header_nritems(leaf);
+ min_push_space = leaf_space_used(leaf, 0, nritems);
+ wret = push_leaf_right(trans, root, path, 0,
+ min_push_space, 1, 0);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
}
@@ -4385,7 +4540,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int level;
struct extent_buffer *c;
struct extent_buffer *next;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
+ bool need_commit_sem = false;
u32 nritems;
int ret;
int i;
@@ -4402,14 +4559,20 @@ again:
path->keep_locks = 1;
- if (time_seq)
+ if (time_seq) {
ret = btrfs_search_old_slot(root, &key, path, time_seq);
- else
+ } else {
+ if (path->need_commit_sem) {
+ path->need_commit_sem = 0;
+ need_commit_sem = true;
+ down_read(&fs_info->commit_root_sem);
+ }
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ }
path->keep_locks = 0;
if (ret < 0)
- return ret;
+ goto done;
nritems = btrfs_header_nritems(path->nodes[0]);
/*
@@ -4532,6 +4695,15 @@ again:
ret = 0;
done:
unlock_up(path, 0, 1, 0, NULL);
+ if (need_commit_sem) {
+ int ret2;
+
+ path->need_commit_sem = 1;
+ ret2 = finish_need_commit_sem_search(path);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c0cebcf745ce..077c95e9baa5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,6 +48,8 @@ extern struct kmem_cache *btrfs_free_space_cachep;
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
+struct btrfs_bio;
+struct btrfs_ioctl_encoded_io_args;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -142,6 +144,13 @@ enum {
BTRFS_FS_STATE_DEV_REPLACING,
/* The btrfs_fs_info created for self-tests */
BTRFS_FS_STATE_DUMMY_FS_INFO,
+
+ BTRFS_FS_STATE_NO_CSUMS,
+
+ /* Indicates there was an error cleaning up a log tree. */
+ BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+
+ BTRFS_FS_STATE_COUNT
};
#define BTRFS_BACKREF_REV_MAX 256
@@ -217,6 +226,9 @@ struct btrfs_root_backup {
u8 unused_8[10];
} __attribute__ ((__packed__));
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
+#define BTRFS_SUPER_INFO_SIZE 4096
+
/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
@@ -265,11 +277,21 @@ struct btrfs_super_block {
/* the UUID written into btree blocks */
u8 metadata_uuid[BTRFS_FSID_SIZE];
+ /* Extent tree v2 */
+ __le64 block_group_root;
+ __le64 block_group_root_generation;
+ u8 block_group_root_level;
+
/* future expansion */
- __le64 reserved[28];
+ u8 reserved8[7];
+ __le64 reserved[25];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+ /* Padded to 4096 bytes */
+ u8 padding[565];
} __attribute__ ((__packed__));
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
/*
* Compat flags that we support. If any incompat flags are set other than the
@@ -287,6 +309,26 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+#else
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
@@ -301,6 +343,7 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
BTRFS_FEATURE_INCOMPAT_ZONED)
+#endif
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -503,11 +546,6 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
-enum btrfs_orphan_cleanup_state {
- ORPHAN_CLEANUP_STARTED = 1,
- ORPHAN_CLEANUP_DONE = 2,
-};
-
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
/* fs_info */
@@ -545,7 +583,6 @@ struct btrfs_swapfile_pin {
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
enum {
- BTRFS_FS_BARRIER,
BTRFS_FS_CLOSING_START,
BTRFS_FS_CLOSING_DONE,
BTRFS_FS_LOG_RECOVERING,
@@ -568,7 +605,6 @@ enum {
/*
* Indicate that relocation of a chunk has started, it's set per chunk
* and is toggled between chunks.
- * Set, tested and cleared while holding fs_info::send_reloc_lock.
*/
BTRFS_FS_RELOC_RUNNING,
@@ -593,6 +629,12 @@ enum {
/* Indicate whether there are any tree modification log users */
BTRFS_FS_TREE_MOD_LOG_USERS,
+ /* Indicate that we want the transaction kthread to commit right now. */
+ BTRFS_FS_COMMIT_TRANS,
+
+ /* Indicate we have half completed snapshot deletions pending. */
+ BTRFS_FS_UNFINISHED_DROPS,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -605,6 +647,7 @@ enum {
*/
enum btrfs_exclusive_operation {
BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE_PAUSED,
BTRFS_EXCLOP_BALANCE,
BTRFS_EXCLOP_DEV_ADD,
BTRFS_EXCLOP_DEV_REMOVE,
@@ -616,20 +659,22 @@ enum btrfs_exclusive_operation {
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
- struct btrfs_root *extent_root;
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
- struct btrfs_root *csum_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
- struct btrfs_root *free_space_root;
struct btrfs_root *data_reloc_root;
+ struct btrfs_root *block_group_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
+ /* The tree that holds the global roots (csum, extent, etc) */
+ rwlock_t global_root_lock;
+ struct rb_root global_root_tree;
+
spinlock_t fs_roots_radix_lock;
struct radix_tree_root fs_roots_radix;
@@ -665,6 +710,12 @@ struct btrfs_fs_info {
u64 generation;
u64 last_trans_committed;
+ /*
+ * Generation of the last transaction used for block group relocation
+ * since the filesystem was last mounted (or 0 if none happened yet).
+ * Must be written and read while holding btrfs_fs_info::commit_root_sem.
+ */
+ u64 last_reloc_trans;
u64 avg_delayed_ref_runtime;
/*
@@ -807,7 +858,6 @@ struct btrfs_fs_info {
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
- struct btrfs_workqueue *readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
@@ -899,6 +949,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
+ struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -943,13 +994,6 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* readahead tree */
- spinlock_t reada_lock;
- struct radix_tree_root reada_tree;
-
- /* readahead works cnt */
- atomic_t reada_works_cnt;
-
/* Extent buffer radix tree */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
@@ -994,13 +1038,6 @@ struct btrfs_fs_info {
struct crypto_shash *csum_shash;
- spinlock_t send_reloc_lock;
- /*
- * Number of send operations in progress.
- * Updated while holding fs_info::send_reloc_lock.
- */
- int send_in_progress;
-
/* Type of exclusive operation running, protected by super_lock */
enum btrfs_exclusive_operation exclusive_operation;
@@ -1017,6 +1054,19 @@ struct btrfs_fs_info {
spinlock_t treelog_bg_lock;
u64 treelog_bg;
+ /*
+ * Start of the dedicated data relocation block group, protected by
+ * relocation_bg_lock.
+ */
+ spinlock_t relocation_bg_lock;
+ u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
+
+ u64 nr_global_roots;
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1091,8 +1141,17 @@ enum {
BTRFS_ROOT_HAS_LOG_TREE,
/* Qgroup flushing is in progress */
BTRFS_ROOT_QGROUP_FLUSHING,
+ /* We started the orphan cleanup for this root. */
+ BTRFS_ROOT_ORPHAN_CLEANUP,
+ /* This root has a drop operation that was started previously. */
+ BTRFS_ROOT_UNFINISHED_DROP,
};
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
/*
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
* code. For detail check comment in fs/btrfs/qgroup.c.
@@ -1109,6 +1168,8 @@ struct btrfs_qgroup_swapped_blocks {
* and for the extent tree extent_root root.
*/
struct btrfs_root {
+ struct rb_node rb_node;
+
struct extent_buffer *node;
struct extent_buffer *commit_root;
@@ -1159,8 +1220,6 @@ struct btrfs_root {
spinlock_t log_extents_lock[2];
struct list_head logged_list[2];
- int orphan_cleanup_state;
-
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
struct rb_root inode_tree;
@@ -1584,25 +1643,25 @@ DECLARE_BTRFS_SETGET_BITS(64)
static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
return btrfs_get_##bits(eb, s, offsetof(type, member)); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
} \
static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
return btrfs_get_token_##bits(token, s, offsetof(type, member));\
} \
static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
type *s, u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
@@ -1633,8 +1692,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
total_bytes));
}
@@ -1642,8 +1701,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s,
u64 val)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
}
@@ -1941,8 +2000,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb,
}
/* struct btrfs_item */
-BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
-BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
@@ -1957,25 +2016,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
return (struct btrfs_item *)btrfs_item_nr_offset(nr);
}
-static inline u32 btrfs_item_end(const struct extent_buffer *eb,
- struct btrfs_item *item)
-{
- return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
+#define BTRFS_ITEM_SETGET_FUNCS(member) \
+static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \
+ int slot) \
+{ \
+ return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \
+} \
+static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
+ int slot, u32 val) \
+{ \
+ btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \
+} \
+static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
+ int slot) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ return btrfs_token_raw_item_##member(token, item); \
+} \
+static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
+ int slot, u32 val) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ btrfs_set_token_raw_item_##member(token, item, val); \
}
-static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_end(eb, btrfs_item_nr(nr));
-}
+BTRFS_ITEM_SETGET_FUNCS(offset)
+BTRFS_ITEM_SETGET_FUNCS(size);
-static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
{
- return btrfs_item_offset(eb, btrfs_item_nr(nr));
-}
-
-static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_size(eb, btrfs_item_nr(nr));
+ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
}
static inline void btrfs_item_key(const struct extent_buffer *eb,
@@ -2238,6 +2308,11 @@ static inline bool btrfs_root_dead(const struct btrfs_root *root)
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
}
+static inline u64 btrfs_root_id(const struct btrfs_root *root)
+{
+ return root->root_key.objectid;
+}
+
/* struct btrfs_root_backup */
BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
tree_root, 64);
@@ -2287,6 +2362,17 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
+/*
+ * For extent tree v2 we overload the extent root with the block group root, as
+ * we will have multiple extent roots.
+ */
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level,
+ struct btrfs_root_backup, extent_root_level, 8);
+
/* struct btrfs_balance_item */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
@@ -2421,6 +2507,13 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block,
+ block_group_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation,
+ struct btrfs_super_block,
+ block_group_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block,
+ block_group_root_level, 8);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
@@ -2439,7 +2532,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
if (nr == 0)
return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
- return btrfs_item_offset_nr(leaf, nr - 1);
+ return btrfs_item_offset(leaf, nr - 1);
}
/* struct btrfs_file_extent_item */
@@ -2498,9 +2591,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
*/
static inline u32 btrfs_file_extent_inline_item_len(
const struct extent_buffer *eb,
- struct btrfs_item *e)
+ int nr)
{
- return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
/* btrfs_qgroup_status_item */
@@ -2592,11 +2685,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset_nr(leaf, slot)))
+ btrfs_item_offset(leaf, slot)))
#define btrfs_item_ptr_offset(leaf, slot) \
((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset_nr(leaf, slot)))
+ btrfs_item_offset(leaf, slot)))
static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
{
@@ -2700,7 +2793,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 empty_size,
enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -2798,7 +2891,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
@@ -2885,16 +2979,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+/*
+ * Describes a batch of items to insert in a btree. This is used by
+ * btrfs_insert_empty_items().
+ */
+struct btrfs_item_batch {
+ /*
+ * Pointer to an array containing the keys of the items to insert (in
+ * sorted order).
+ */
+ const struct btrfs_key *keys;
+ /* Pointer to an array containing the data size for each item to insert. */
+ const u32 *data_sizes;
+ /*
+ * The sum of data sizes for all items. The caller can compute this while
+ * setting up the data_sizes array, so it ends up being more efficient
+ * than having btrfs_insert_empty_items() or setup_item_for_insert()
+ * doing it, as it would avoid an extra loop over a potentially large
+ * array, and in the case of setup_item_for_insert(), we would be doing
+ * it while holding a write lock on a leaf and often on upper level nodes
+ * too, unnecessarily increasing the size of a critical section.
+ */
+ u32 total_data_size;
+ /* Size of the keys and data_sizes arrays (number of items in the batch). */
+ int nr;
+};
+
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+ const struct btrfs_item_batch *batch);
static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2902,7 +3022,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
const struct btrfs_key *key,
u32 data_size)
{
- return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ return btrfs_insert_empty_items(trans, root, path, &batch);
}
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
@@ -3062,36 +3189,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
-/* inode-item.c */
-int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 index);
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 *index);
-int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path,
- struct btrfs_key *location, int mod);
-
-struct btrfs_inode_extref *
-btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow);
-
-struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
- int slot, const char *name,
- int name_len);
-struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
- struct extent_buffer *leaf, int slot, u64 ref_objectid,
- const char *name, int name_len);
/* file-item.c */
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
@@ -3111,7 +3208,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 file_start, int contig);
+ u64 offset, bool one_ordered);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
@@ -3129,8 +3226,9 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3142,7 +3240,6 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len);
int btrfs_add_link(struct btrfs_trans_handle *trans,
@@ -3151,10 +3248,6 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
int front);
-int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode, u64 new_size,
- u32 min_type, u64 *extents_found);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
@@ -3174,8 +3267,6 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
@@ -3218,6 +3309,11 @@ int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate);
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+
extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
extern const struct iomap_dio_ops btrfs_dio_ops;
@@ -3242,9 +3338,9 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_pages);
+ u64 newer_than, unsigned long max_to_defrag);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
@@ -3255,12 +3351,15 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
+
/* file.c */
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
+ struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
@@ -3277,6 +3376,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
@@ -3563,6 +3664,12 @@ do { \
(errno), fmt, ##args); \
} while (0)
+#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+ &(fs_info)->fs_state)))
+#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
+ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
+ &(fs_info)->fs_state)))
+
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
@@ -3727,7 +3834,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
@@ -3768,23 +3875,6 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
btrfs_bio_counter_sub(fs_info, 1);
}
-/* reada.c */
-struct reada_control {
- struct btrfs_fs_info *fs_info; /* tree to prefetch */
- struct btrfs_key key_start;
- struct btrfs_key key_end; /* exclusive */
- atomic_t elems;
- struct kref refcnt;
- wait_queue_head_t wait;
-};
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
- struct btrfs_key *start, struct btrfs_key *end);
-int btrfs_reada_wait(void *handle);
-void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct extent_buffer *eb, int err);
-void btrfs_reada_remove_dev(struct btrfs_device *dev);
-void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
-
static inline int is_fstree(u64 rootid)
{
if (rootid == BTRFS_FS_TREE_OBJECTID ||
@@ -3842,6 +3932,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
return fs_info->zoned != 0;
}
+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
+{
+ return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
+}
+
/*
* We use page status Private2 to indicate there is an ordered extent with
* unfinished IO.
@@ -3851,5 +3946,8 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
#define PageOrdered(page) PagePrivate2(page)
#define SetPageOrdered(page) SetPagePrivate2(page)
#define ClearPageOrdered(page) ClearPagePrivate2(page)
+#define folio_test_ordered(folio) folio_test_private_2(folio)
+#define folio_set_ordered(folio) folio_set_private_2(folio)
+#define folio_clear_ordered(folio) folio_clear_private_2(folio)
#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 2059d1504149..bd8267c4687d 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -143,10 +143,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
- if (ret < 0)
+ if (ret < 0) {
btrfs_free_reserved_data_space_noquota(fs_info, len);
- else
+ extent_changeset_free(*reserved);
+ *reserved = NULL;
+ } else {
ret = 0;
+ }
return ret;
}
@@ -267,11 +270,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
}
static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
- u64 num_bytes, u64 *meta_reserve,
- u64 *qgroup_reserve)
+ u64 num_bytes, u64 disk_num_bytes,
+ u64 *meta_reserve, u64 *qgroup_reserve)
{
u64 nr_extents = count_max_extents(num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
+ u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
@@ -285,7 +288,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
*qgroup_reserve = nr_extents * fs_info->nodesize;
}
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -315,6 +319,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
}
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);
/*
* We always want to do it this way, every other way is wrong and ends
@@ -326,12 +331,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
- &qgroup_reserve);
+ calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ &meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
@@ -346,7 +351,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
spin_lock(&inode->lock);
nr_extents = count_max_extents(num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += num_bytes;
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -451,9 +456,12 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(inode, len);
- if (ret < 0)
+ ret = btrfs_delalloc_reserve_metadata(inode, len, len);
+ if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
+ extent_changeset_free(*reserved);
+ *reserved = NULL;
+ }
return ret;
}
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1e08eb2b27f0..748bf6b0d860 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -13,6 +13,7 @@
#include "ctree.h"
#include "qgroup.h"
#include "locking.h"
+#include "inode-item.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -629,7 +630,7 @@ static int btrfs_delayed_inode_reserve_metadata(
BTRFS_QGROUP_RSV_META_PREALLOC, true);
if (ret < 0)
return ret;
- ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
BTRFS_RESERVE_NO_FLUSH);
/* NO_FLUSH could only fail with -ENOSPC */
ASSERT(ret == 0 || ret == -ENOSPC);
@@ -679,19 +680,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *first_item)
{
- LIST_HEAD(batch);
+ LIST_HEAD(item_list);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ struct btrfs_item_batch batch;
int total_size;
- int nitems;
char *ins_data = NULL;
- struct btrfs_key *ins_keys;
- u32 *ins_sizes;
int ret;
- list_add_tail(&first_item->tree_list, &batch);
- nitems = 1;
+ list_add_tail(&first_item->tree_list, &item_list);
+ batch.total_data_size = first_item->data_len;
+ batch.nr = 1;
total_size = first_item->data_len + sizeof(struct btrfs_item);
curr = first_item;
@@ -706,39 +706,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
if (total_size + next_size > max_size)
break;
- list_add_tail(&next->tree_list, &batch);
- nitems++;
+ list_add_tail(&next->tree_list, &item_list);
+ batch.nr++;
total_size += next_size;
+ batch.total_data_size += next->data_len;
curr = next;
}
- if (nitems == 1) {
- ins_keys = &first_item->key;
- ins_sizes = &first_item->data_len;
+ if (batch.nr == 1) {
+ batch.keys = &first_item->key;
+ batch.data_sizes = &first_item->data_len;
} else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
int i = 0;
- ins_data = kmalloc(nitems * sizeof(u32) +
- nitems * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc(batch.nr * sizeof(u32) +
+ batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data) {
ret = -ENOMEM;
goto out;
}
ins_sizes = (u32 *)ins_data;
- ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
- list_for_each_entry(curr, &batch, tree_list) {
+ ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ list_for_each_entry(curr, &item_list, tree_list) {
ins_keys[i] = curr->key;
ins_sizes[i] = curr->data_len;
i++;
}
}
- ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
- nitems);
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret)
goto out;
- list_for_each_entry(curr, &batch, tree_list) {
+ list_for_each_entry(curr, &item_list, tree_list) {
char *data_ptr;
data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
@@ -754,7 +758,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
*/
btrfs_release_path(path);
- list_for_each_entry_safe(curr, next, &batch, tree_list) {
+ list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ca848b183474..4176df149d04 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -84,6 +84,17 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
u64 released = 0;
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
@@ -108,6 +119,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
num_bytes = btrfs_calc_insert_metadata_size(fs_info,
trans->delayed_ref_updates);
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
delayed_rsv->full = 0;
@@ -191,8 +213,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
- num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (ret)
return ret;
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
@@ -906,7 +927,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
u64 parent = generic_ref->parent;
u8 ref_type;
- is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+ is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
BUG_ON(extent_op && extent_op->is_data);
@@ -921,8 +942,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(generic_ref->real_root) &&
- is_fstree(generic_ref->tree_ref.root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
@@ -938,14 +957,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.root, action, ref_type);
- ref->root = generic_ref->tree_ref.root;
+ generic_ref->tree_ref.owning_root, action,
+ ref_type);
+ ref->root = generic_ref->tree_ref.owning_root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.root, 0, action, false,
- is_system);
+ generic_ref->tree_ref.owning_root, 0, action,
+ false, is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -997,7 +1017,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.ref_root;
+ u64 ref_root = generic_ref->data_ref.owning_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
@@ -1026,8 +1046,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(ref_root) &&
- is_fstree(generic_ref->real_root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e22fba272e4f..91a3aabad150 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -186,8 +186,8 @@ enum btrfs_ref_type {
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
- /* Root which refers to this data extent */
- u64 ref_root;
+ /* Original root this data extent belongs to */
+ u64 owning_root;
/* Inode which refers to this data extent */
u64 ino;
@@ -210,11 +210,11 @@ struct btrfs_tree_ref {
int level;
/*
- * Root which refers to this tree block.
+ * Root which owns this tree block.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
- u64 root;
+ u64 owning_root;
/* For non-skinny metadata, no special member needed */
};
@@ -231,17 +231,10 @@ struct btrfs_ref {
*/
bool skip_qgroup;
- /*
- * Optional. For which root is this modification.
- * Mostly used for qgroup optimization.
- *
- * When unset, data/tree ref init code will populate it.
- * In certain cases, we're modifying reference for a different root.
- * E.g. COW fs tree blocks for balance.
- * In that case, tree_ref::root will be fs tree, but we're doing this
- * for reloc tree, then we should set @real_root to reloc tree.
- */
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* Through which root is this modification. */
u64 real_root;
+#endif
u64 bytenr;
u64 len;
@@ -271,26 +264,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
}
static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root)
+ int level, u64 root, u64 mod_root, bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @mod_root is not set, use @root as fallback for @real_root */
- if (!generic_ref->real_root)
- generic_ref->real_root = root;
+ generic_ref->real_root = mod_root ?: root;
+#endif
generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.root = root;
+ generic_ref->tree_ref.owning_root = root;
generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+
}
static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset)
+ u64 ref_root, u64 ino, u64 offset, u64 mod_root,
+ bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @mod_root is not set, use @ref_root as fallback for @real_root */
- if (!generic_ref->real_root)
- generic_ref->real_root = ref_root;
- generic_ref->data_ref.ref_root = ref_root;
+ generic_ref->real_root = mod_root ?: ref_root;
+#endif
+ generic_ref->data_ref.owning_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
}
static inline struct btrfs_delayed_extent_op *
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d029be40ea6f..f26202621989 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -70,6 +70,7 @@ static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
struct btrfs_key key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
@@ -100,8 +101,7 @@ no_valid_dev_replace_entry_found:
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
ret = -EUCLEAN;
@@ -128,7 +128,7 @@ no_valid_dev_replace_entry_found:
}
slot = path->slots[0];
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
@@ -163,8 +163,7 @@ no_valid_dev_replace_entry_found:
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"replace devid present without an active replace item");
ret = -EUCLEAN;
@@ -175,11 +174,10 @@ no_valid_dev_replace_entry_found:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
- src_devid, NULL, NULL);
- dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+ args.devid = src_devid;
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -245,6 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
struct block_device *bdev;
struct rcu_string *name;
@@ -273,7 +272,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
sync_blockdev(bdev);
- list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -283,8 +282,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
- if (i_size_read(bdev->bd_inode) <
- btrfs_device_get_total_bytes(srcdev)) {
+ if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info,
"target device is smaller than source device!");
ret = -EINVAL;
@@ -305,6 +303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
goto error;
}
rcu_assign_pointer(device->name, name);
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error;
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
@@ -323,17 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
- device->fs_devices = fs_info->fs_devices;
+ device->fs_devices = fs_devices;
- ret = btrfs_get_dev_zone_info(device);
+ ret = btrfs_get_dev_zone_info(device, false);
if (ret)
goto error;
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- list_add(&device->dev_list, &fs_info->fs_devices->devices);
- fs_info->fs_devices->num_devices++;
- fs_info->fs_devices->open_devices++;
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_add(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
+ fs_devices->open_devices++;
+ mutex_unlock(&fs_devices->device_list_mutex);
*device_out = device;
return 0;
@@ -384,7 +385,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
}
if (ret == 0 &&
- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/*
* need to delete old one and insert a new one.
* Since no attempt is made to recover any old state, if the
@@ -733,7 +734,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- /* Commit dev_replace state and reserve 1 item for it. */
+ /*
+ * Commit dev_replace state and reserve 1 item for it.
+ * This is crucial to ensure we won't miss copying extents for new block
+ * groups that are allocated after we started the device replace, and
+ * must be done after setting up the device replace state.
+ */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -909,9 +915,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- if (!scrub_ret)
- btrfs_reada_remove_dev(src_device);
-
/*
* We have to use this loop approach because at this point src_device
* has to be available for transaction commit to complete, yet new
@@ -920,7 +923,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
while (1) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- btrfs_reada_undo_remove_dev(src_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
@@ -971,7 +973,6 @@ error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_reada_undo_remove_dev(src_device);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7721ce0c0604..3b532bab0755 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *ptr;
- struct btrfs_item *item;
struct extent_buffer *leaf;
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
@@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
return ERR_PTR(ret);
WARN_ON(ret > 0);
leaf = path->nodes[0];
- item = btrfs_item_nr(path->slots[0]);
ptr = btrfs_item_ptr(leaf, path->slots[0], char);
- BUG_ON(data_size > btrfs_item_size(leaf, item));
- ptr += btrfs_item_size(leaf, item) - data_size;
+ ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0]));
+ ptr += btrfs_item_size(leaf, path->slots[0]) - data_size;
return (struct btrfs_dir_item *)ptr;
}
@@ -271,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
data_size = sizeof(*di) + name_len;
leaf = path->nodes[0];
slot = path->slots[0];
- if (data_size + btrfs_item_size_nr(leaf, slot) +
+ if (data_size + btrfs_item_size(leaf, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
ret = -EOVERFLOW;
} else {
@@ -409,7 +407,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
- total_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ total_len = btrfs_item_size(leaf, path->slots[0]);
while (cur < total_len) {
this_len = sizeof(*dir_item) +
btrfs_dir_name_len(leaf, dir_item) +
@@ -445,7 +443,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
btrfs_dir_data_len(leaf, di);
- item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_len = btrfs_item_size(leaf, path->slots[0]);
if (sub_item_len == item_len) {
ret = btrfs_del_item(trans, root, path);
} else {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 355ea88d5c5f..31c3f592e587 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -441,17 +441,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb)
else
ret = btrfs_check_leaf_full(eb);
- if (ret < 0) {
- btrfs_print_tree(eb, 0);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * Also check the generation: an eb that reaches here must be newer than the
+ * last committed transaction, otherwise something has gone seriously wrong.
+ */
+ if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ ret = -EUCLEAN;
btrfs_err(fs_info,
- "block=%llu write time tree block corruption detected",
- eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- return ret;
+ "block=%llu bad generation, have %llu expect > %llu",
+ eb->start, btrfs_header_generation(eb),
+ fs_info->last_trans_committed);
+ goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
+
+error:
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return ret;
}
/* Checksum all dirty extent buffers in one bio_vec */
@@ -665,9 +679,6 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
if (ret < 0)
goto err;
- if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb, ret);
-
set_extent_buffer_uptodate(eb);
free_extent_buffer(eb);
@@ -683,7 +694,7 @@ err:
return ret;
}
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror)
{
@@ -715,10 +726,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
}
ret = validate_extent_buffer(eb);
err:
- if (reads_done &&
- test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb, ret);
-
if (ret) {
/*
* our io error hook is going to dec the io pages
@@ -1006,41 +1013,40 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
return try_release_extent_buffer(page);
}
-static void btree_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btree_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- extent_invalidatepage(tree, page, offset);
- btree_releasepage(page, GFP_NOFS);
- if (PagePrivate(page)) {
- btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
- "page private not zero on page %llu",
- (unsigned long long)page_offset(page));
- detach_page_private(page);
+ tree = &BTRFS_I(folio->mapping->host)->io_tree;
+ extent_invalidate_folio(tree, folio, offset);
+ btree_releasepage(&folio->page, GFP_NOFS);
+ if (folio_get_private(folio)) {
+ btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
+ "folio private not zero on folio %llu",
+ (unsigned long long)folio_pos(folio));
+ folio_detach_private(folio);
}
}
-static int btree_set_page_dirty(struct page *page)
-{
#ifdef DEBUG
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+static bool btree_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
int cur_bit = 0;
- u64 page_start = page_offset(page);
+ u64 page_start = folio_pos(folio);
if (fs_info->sectorsize == PAGE_SIZE) {
- BUG_ON(!PagePrivate(page));
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
- return __set_page_dirty_nobuffers(page);
+ btrfs_assert_tree_write_locked(eb);
+ return filemap_dirty_folio(mapping, folio);
}
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
ASSERT(subpage->dirty_bitmap);
while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
@@ -1061,23 +1067,25 @@ static int btree_set_page_dirty(struct page *page)
ASSERT(eb);
ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
ASSERT(atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ btrfs_assert_tree_write_locked(eb);
free_extent_buffer(eb);
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
}
-#endif
- return __set_page_dirty_nobuffers(page);
+ return filemap_dirty_folio(mapping, folio);
}
+#else
+#define btree_dirty_folio filemap_dirty_folio
+#endif
static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
.releasepage = btree_releasepage,
- .invalidatepage = btree_invalidatepage,
+ .invalidate_folio = btree_invalidate_folio,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
- .set_page_dirty = btree_set_page_dirty,
+ .dirty_folio = btree_dirty_folio,
};
struct extent_buffer *btrfs_find_create_tree_block(
@@ -1125,7 +1133,7 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
struct btrfs_fs_info *fs_info = buf->fs_info;
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
@@ -1140,11 +1148,16 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+ memset(&root->root_key, 0, sizeof(root->root_key));
+ memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
root->fs_info = fs_info;
+ root->root_key.objectid = objectid;
root->node = NULL;
root->commit_root = NULL;
root->state = 0;
- root->orphan_cleanup_state = 0;
+ RB_CLEAR_NODE(&root->rb_node);
root->last_trans = 0;
root->free_objectid = 0;
@@ -1152,7 +1165,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
- root->block_rsv = NULL;
+
+ btrfs_init_root_block_rsv(root);
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
@@ -1190,6 +1204,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
+ root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
@@ -1197,12 +1212,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
IO_TREE_LOG_CSUM_RANGE, NULL);
}
- memset(&root->root_key, 0, sizeof(root->root_key));
- memset(&root->root_item, 0, sizeof(root->root_item));
- memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- root->root_key.objectid = objectid;
- root->anon_dev = 0;
-
spin_lock_init(&root->root_item_lock);
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
@@ -1242,6 +1251,102 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
}
#endif
+static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
+{
+ const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
+ const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
+}
+
+static int global_root_key_cmp(const void *k, const struct rb_node *node)
+{
+ const struct btrfs_key *key = k;
+ const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(key, &root->root_key);
+}
+
+int btrfs_global_root_insert(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *tmp;
+
+ write_lock(&fs_info->global_root_lock);
+ tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
+ write_unlock(&fs_info->global_root_lock);
+ ASSERT(!tmp);
+
+ return tmp ? -EEXIST : 0;
+}
+
+void btrfs_global_root_delete(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ write_lock(&fs_info->global_root_lock);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ write_unlock(&fs_info->global_root_lock);
+}
+
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key)
+{
+ struct rb_node *node;
+ struct btrfs_root *root = NULL;
+
+ read_lock(&fs_info->global_root_lock);
+ node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
+ if (node)
+ root = container_of(node, struct btrfs_root, rb_node);
+ read_unlock(&fs_info->global_root_lock);
+
+ return root;
+}
+
+static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ u64 ret;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (bytenr)
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ else
+ block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
+ ASSERT(block_group);
+ if (!block_group)
+ return 0;
+ ret = block_group->global_root_id;
+ btrfs_put_block_group(block_group);
+
+ return ret;
+}
+
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_CSUM_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_EXTENT_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -1453,7 +1558,8 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
ret = PTR_ERR(root->node);
root->node = NULL;
goto fail;
- } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ }
+ if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
goto fail;
}
@@ -1500,7 +1606,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
goto fail;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ !btrfs_is_data_reloc_root(root)) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1554,25 +1660,33 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
u64 objectid)
{
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
if (objectid == BTRFS_ROOT_TREE_OBJECTID)
return btrfs_grab_root(fs_info->tree_root);
if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->extent_root);
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
return btrfs_grab_root(fs_info->chunk_root);
if (objectid == BTRFS_DEV_TREE_OBJECTID)
return btrfs_grab_root(fs_info->dev_root);
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->csum_root);
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
return btrfs_grab_root(fs_info->quota_root) ?
fs_info->quota_root : ERR_PTR(-ENOENT);
if (objectid == BTRFS_UUID_TREE_OBJECTID)
return btrfs_grab_root(fs_info->uuid_root) ?
fs_info->uuid_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->free_space_root) ?
- fs_info->free_space_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ struct btrfs_root *root = btrfs_global_root(fs_info, &key);
+
+ return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
+ }
return NULL;
}
@@ -1619,6 +1733,18 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
#endif
}
+static void free_global_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct rb_node *node;
+
+ while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
+ root = rb_entry(node, struct btrfs_root, rb_node);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ btrfs_put_root(root);
+ }
+}
+
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
@@ -1630,20 +1756,20 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_free_ref_cache(fs_info);
kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
- btrfs_put_root(fs_info->extent_root);
+ free_global_roots(fs_info);
btrfs_put_root(fs_info->tree_root);
btrfs_put_root(fs_info->chunk_root);
btrfs_put_root(fs_info->dev_root);
- btrfs_put_root(fs_info->csum_root);
btrfs_put_root(fs_info->quota_root);
btrfs_put_root(fs_info->uuid_root);
- btrfs_put_root(fs_info->free_space_root);
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
+ btrfs_put_root(fs_info->block_group_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
+ kfree(fs_info->subpage_info);
kvfree(fs_info);
}
@@ -1724,13 +1850,22 @@ again:
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
- btrfs_put_root(root);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
+ btrfs_put_root(root);
goto again;
+ }
goto fail;
}
return root;
fail:
+ /*
+ * If our caller provided us an anonymous device, then it's his
+ * responsibility to free it in case we fail. So we have to set our
+ * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
+ * and once again by our caller.
+ */
+ if (anon_dev)
+ root->anon_dev = 0;
btrfs_put_root(root);
return ERR_PTR(ret);
}
@@ -1829,8 +1964,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
- struct btrfs_root *root = arg;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg;
int again;
while (1) {
@@ -1863,7 +1997,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_delayed_iputs(fs_info);
- again = btrfs_clean_one_deleted_snapshot(root);
+ again = btrfs_clean_one_deleted_snapshot(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
/*
@@ -1926,7 +2060,8 @@ static int transaction_kthread(void *arg)
}
delta = ktime_get_seconds() - cur->start_time;
- if (cur->state < TRANS_STATE_COMMIT_START &&
+ if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
+ cur->state < TRANS_STATE_COMMIT_START &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
delay -= msecs_to_jiffies((delta - 1) * 1000);
@@ -1953,8 +2088,7 @@ sleep:
wake_up_process(fs_info->cleaner_kthread);
mutex_unlock(&fs_info->transaction_kthread_mutex);
- if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
- &fs_info->fs_state)))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_cleanup_transaction(fs_info);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
@@ -2023,11 +2157,30 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
- btrfs_set_backup_extent_root_gen(root_backup,
- btrfs_header_generation(info->extent_root->node));
- btrfs_set_backup_extent_root_level(root_backup,
- btrfs_header_level(info->extent_root->node));
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
+ btrfs_set_backup_block_group_root(root_backup,
+ info->block_group_root->node->start);
+ btrfs_set_backup_block_group_root_gen(root_backup,
+ btrfs_header_generation(info->block_group_root->node));
+ btrfs_set_backup_block_group_root_level(root_backup,
+ btrfs_header_level(info->block_group_root->node));
+ } else {
+ struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
+ struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
+
+ btrfs_set_backup_extent_root(root_backup,
+ extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(extent_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(csum_root->node));
+ }
/*
* we might commit during log recovery, which happens before we set
@@ -2048,12 +2201,6 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_dev_root_level(root_backup,
btrfs_header_level(info->dev_root->node));
- btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
- btrfs_set_backup_csum_root_gen(root_backup,
- btrfs_header_generation(info->csum_root->node));
- btrfs_set_backup_csum_root_level(root_backup,
- btrfs_header_level(info->csum_root->node));
-
btrfs_set_backup_total_bytes(root_backup,
btrfs_super_total_bytes(info->super_copy));
btrfs_set_backup_bytes_used(root_backup,
@@ -2127,7 +2274,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
btrfs_destroy_workqueue(fs_info->caching_workers);
- btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
if (fs_info->discard_ctl.discard_workers)
@@ -2151,21 +2297,30 @@ static void free_root_extent_buffers(struct btrfs_root *root)
}
}
+static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root, *tmp;
+
+ rbtree_postorder_for_each_entry_safe(root, tmp,
+ &fs_info->global_root_tree,
+ rb_node)
+ free_root_extent_buffers(root);
+}
+
/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
free_root_extent_buffers(info->tree_root);
+ free_global_root_pointers(info);
free_root_extent_buffers(info->dev_root);
- free_root_extent_buffers(info->extent_root);
- free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
+ free_root_extent_buffers(info->block_group_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
- free_root_extent_buffers(info->free_space_root);
}
void btrfs_put_root(struct btrfs_root *root)
@@ -2283,8 +2438,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->qgroup_rescan_lock);
}
-static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices)
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
@@ -2333,9 +2487,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->delayed_workers =
btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
max_active, 0);
- fs_info->readahead_workers =
- btrfs_alloc_workqueue(fs_info, "readahead", flags,
- max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
fs_info->discard_ctl.discard_workers =
@@ -2347,9 +2498,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->endio_meta_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
- fs_info->caching_workers && fs_info->readahead_workers &&
- fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->qgroup_rescan_workers &&
+ fs_info->caching_workers && fs_info->fixup_workers &&
+ fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
fs_info->discard_ctl.discard_workers)) {
return -ENOMEM;
}
@@ -2404,11 +2554,13 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = NULL;
btrfs_put_root(log_tree_root);
return ret;
- } else if (!extent_buffer_uptodate(log_tree_root->node)) {
+ }
+ if (!extent_buffer_uptodate(log_tree_root->node)) {
btrfs_err(fs_info, "failed to read log tree");
btrfs_put_root(log_tree_root);
return -EIO;
}
+
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
@@ -2427,6 +2579,115 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return 0;
}
+static int load_global_roots_objectid(struct btrfs_root *tree_root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name)
+{
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_root *root;
+ u64 max_global_id = 0;
+ int ret;
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ bool found = false;
+
+ /* If we have IGNOREDATACSUMS skip loading these roots. */
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
+ btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ return 0;
+ }
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+ }
+ ret = 0;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != objectid)
+ break;
+ btrfs_release_path(path);
+
+ /*
+ * Just worry about this for extent tree, it'll be the same for
+ * everybody.
+ */
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ max_global_id = max(max_global_id, key.offset);
+
+ found = true;
+ root = read_tree_root_path(tree_root, path, &key);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = PTR_ERR(root);
+ break;
+ }
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ ret = btrfs_global_root_insert(root);
+ if (ret) {
+ btrfs_put_root(root);
+ break;
+ }
+ key.offset++;
+ }
+ btrfs_release_path(path);
+
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ fs_info->nr_global_roots = max_global_id + 1;
+
+ if (!found || ret) {
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = ret ? ret : -ENOENT;
+ else
+ ret = 0;
+ btrfs_err(fs_info, "failed to load root %s", name);
+ }
+ return ret;
+}
+
+static int load_global_roots(struct btrfs_root *tree_root)
+{
+ struct btrfs_path *path;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_EXTENT_TREE_OBJECTID, "extent");
+ if (ret)
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_CSUM_TREE_OBJECTID, "csum");
+ if (ret)
+ goto out;
+ if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_FREE_SPACE_TREE_OBJECTID,
+ "free space");
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -2436,7 +2697,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
BUG_ON(!fs_info->tree_root);
- location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ ret = load_global_roots(tree_root);
+ if (ret)
+ return ret;
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
@@ -2448,38 +2713,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
}
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->extent_root = root;
- }
-
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
- }
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
}
/* Initialize fs_info for all devices in any case */
btrfs_init_devices_late(fs_info);
- /* If IGNOREDATACSUMS is set don't bother reading the csum root. */
- if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
- location.objectid = BTRFS_CSUM_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
- }
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->csum_root = root;
- }
- }
-
/*
* This tree can share blocks with some other fs tree during relocation
* and we need a proper setup by btrfs_get_fs_root
@@ -2517,20 +2755,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->uuid_root = root;
}
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
- }
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->free_space_root = root;
- }
- }
-
return 0;
out:
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -2592,8 +2816,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
/*
* For 4K page size, we only support 4K sector size.
- * For 64K page size, we support read-write for 64K sector size, and
- * read-only for 4K sector size.
+ * For 64K page size, we support 64K and 4K sector sizes.
*/
if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
(PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
@@ -2770,6 +2993,56 @@ out:
return ret;
}
+static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
+{
+ int ret = 0;
+
+ root->node = read_tree_block(root->fs_info, bytenr,
+ root->root_key.objectid, gen, level, NULL);
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
+ root->node = NULL;
+ return ret;
+ }
+ if (!extent_buffer_uptodate(root->node)) {
+ free_extent_buffer(root->node);
+ root->node = NULL;
+ return -EIO;
+ }
+
+ btrfs_set_root_node(&root->root_item, root->node);
+ root->commit_root = btrfs_root_node(root);
+ btrfs_set_root_refs(&root->root_item, 1);
+ return ret;
+}
+
+static int load_important_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ u64 gen, bytenr;
+ int level, ret;
+
+ bytenr = btrfs_super_root(sb);
+ gen = btrfs_super_generation(sb);
+ level = btrfs_super_root_level(sb);
+ ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
+ if (ret) {
+ btrfs_warn(fs_info, "couldn't read tree root");
+ return ret;
+ }
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ bytenr = btrfs_super_block_group_root(sb);
+ gen = btrfs_super_block_group_root_generation(sb);
+ level = btrfs_super_block_group_root_level(sb);
+ ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
+ if (ret)
+ btrfs_warn(fs_info, "couldn't read block group root");
+ return ret;
+}
+
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
int backup_index = find_newest_super_backup(fs_info);
@@ -2779,10 +3052,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int ret = 0;
int i;
- for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
- u64 generation;
- int level;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ struct btrfs_root *root;
+ root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
+ GFP_KERNEL);
+ if (!root)
+ return -ENOMEM;
+ fs_info->block_group_root = root;
+ }
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
if (handle_error) {
if (!IS_ERR(tree_root->node))
free_extent_buffer(tree_root->node);
@@ -2807,29 +3087,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
if (ret < 0)
return ret;
}
- generation = btrfs_super_generation(sb);
- level = btrfs_super_root_level(sb);
- tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
- BTRFS_ROOT_TREE_OBJECTID,
- generation, level, NULL);
- if (IS_ERR(tree_root->node)) {
- handle_error = true;
- ret = PTR_ERR(tree_root->node);
- tree_root->node = NULL;
- btrfs_warn(fs_info, "couldn't read tree root");
- continue;
- } else if (!extent_buffer_uptodate(tree_root->node)) {
+ ret = load_important_roots(fs_info);
+ if (ret) {
handle_error = true;
- ret = -EIO;
- btrfs_warn(fs_info, "error while reading tree root");
continue;
}
- btrfs_set_root_node(&tree_root->root_item, tree_root->node);
- tree_root->commit_root = btrfs_root_node(tree_root);
- btrfs_set_root_refs(&tree_root->root_item, 1);
-
/*
* No need to hold btrfs_root::objectid_mutex since the fs
* hasn't been fully initialised and we are the only user
@@ -2849,8 +3113,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
}
/* All successful */
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
+ fs_info->generation = btrfs_header_generation(tree_root->node);
+ fs_info->last_trans_committed = fs_info->generation;
+ fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
if (backup_index < 0) {
@@ -2883,12 +3148,16 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
+ spin_lock_init(&fs_info->zone_active_bgs_lock);
+ spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
+ rwlock_init(&fs_info->global_root_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->reclaim_bgs_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
+ mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2896,6 +3165,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+ INIT_LIST_HEAD(&fs_info->zone_active_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -2914,9 +3184,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->defrag_running, 0);
- atomic_set(&fs_info->reada_works_cnt, 0);
atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
+ fs_info->global_root_tree = RB_ROOT;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
@@ -2924,9 +3194,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
- /* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- spin_lock_init(&fs_info->reada_lock);
btrfs_init_ref_verify(fs_info);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -2948,7 +3215,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
- set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -2983,9 +3249,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
- spin_lock_init(&fs_info->send_reloc_lock);
- fs_info->send_in_progress = 0;
-
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}
@@ -3135,7 +3398,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
up_read(&fs_info->cleanup_work_sem);
mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(fs_info->tree_root);
+ ret = btrfs_recover_relocation(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
@@ -3228,12 +3491,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
btrfs_init_btree_inode(fs_info);
- invalidate_bdev(fs_devices->latest_bdev);
+ invalidate_bdev(fs_devices->latest_dev->bdev);
/*
* Read super block and check the signature bytes only
*/
- disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
if (IS_ERR(disk_super)) {
err = PTR_ERR(disk_super);
goto fail_alloc;
@@ -3392,12 +3655,23 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- if (sectorsize != PAGE_SIZE) {
+ if (sectorsize < PAGE_SIZE) {
+ struct btrfs_subpage_info *subpage_info;
+
+ /*
+ * V1 space cache has some hardcoded PAGE_SIZE usage, and is
+ * going to be deprecated.
+ *
+ * Force to use v2 cache for subpage case.
+ */
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
+ "forcing free space tree for sector size %u with page size %lu",
+ sectorsize, PAGE_SIZE);
+
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- }
- if (sectorsize != PAGE_SIZE) {
if (btrfs_super_incompat_flags(fs_info->super_copy) &
BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
@@ -3406,9 +3680,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
err = -EINVAL;
goto fail_alloc;
}
+ subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+ if (!subpage_info)
+ goto fail_alloc;
+ btrfs_init_subpage_info(subpage_info, sectorsize);
+ fs_info->subpage_info = subpage_info;
}
- ret = btrfs_init_workqueues(fs_info, fs_devices);
+ ret = btrfs_init_workqueues(fs_info);
if (ret) {
err = ret;
goto fail_sb_buffer;
@@ -3431,21 +3710,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
generation = btrfs_super_chunk_root_generation(disk_super);
level = btrfs_super_chunk_root_level(disk_super);
-
- chunk_root->node = read_tree_block(fs_info,
- btrfs_super_chunk_root(disk_super),
- BTRFS_CHUNK_TREE_OBJECTID,
- generation, level, NULL);
- if (IS_ERR(chunk_root->node) ||
- !extent_buffer_uptodate(chunk_root->node)) {
+ ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
+ generation, level);
+ if (ret) {
btrfs_err(fs_info, "failed to read chunk root");
- if (!IS_ERR(chunk_root->node))
- free_extent_buffer(chunk_root->node);
- chunk_root->node = NULL;
goto fail_tree_roots;
}
- btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
- chunk_root->commit_root = btrfs_root_node(chunk_root);
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
offsetof(struct btrfs_header, chunk_tree_uuid),
@@ -3465,7 +3735,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
- if (!fs_devices->latest_bdev) {
+ if (!fs_devices->latest_dev->bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
}
@@ -3556,13 +3826,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_free_zone_cache(fs_info);
+
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
}
- fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+ fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
goto fail_sysfs;
@@ -3647,6 +3920,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+ /* Kick the cleaner thread so it'll start deleting snapshots. */
+ if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
+ wake_up_process(fs_info->cleaner_kthread);
+
clear_oneshot:
btrfs_clear_oneshot_options(fs_info);
return 0;
@@ -3740,7 +4017,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
else if (ret)
return ERR_PTR(ret);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
@@ -3863,8 +4140,9 @@ static int write_dev_supers(struct btrfs_device *device,
* to do I/O, so we don't lose the ability to do integrity
* checking.
*/
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, device->bdev);
+ bio = bio_alloc(device->bdev, 1,
+ REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
+ GFP_NOFS);
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
bio->bi_private = device;
bio->bi_end_io = btrfs_end_super_write;
@@ -3876,12 +4154,13 @@ static int write_dev_supers(struct btrfs_device *device,
* go down lazy and there's a short window where the on-disk
* copies might still contain the older version.
*/
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
btrfsic_submit_bio(bio);
- btrfs_advance_sb_log(device, i);
+
+ if (btrfs_advance_sb_log(device, i))
+ errors++;
}
return errors < i ? 0 : -1;
}
@@ -3968,16 +4247,26 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct request_queue *q = bdev_get_queue(device->bdev);
struct bio *bio = device->flush_bio;
+#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ /*
+ * When a disk has write caching disabled, we skip submission of a bio
+ * with flush and sync requests before writing the superblock, since
+ * it's not needed. However when the integrity checker is enabled, this
+ * results in reports that there are metadata blocks referred by a
+ * superblock that were not properly flushed. So don't skip the bio
+ * submission only when the integrity checker is enabled for the sake
+ * of simplicity, since this is a debug tool and not meant for use in
+ * non-debug builds.
+ */
+ struct request_queue *q = bdev_get_queue(device->bdev);
if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
return;
+#endif
- bio_reset(bio);
+ bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
- bio_set_dev(bio, device->bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
@@ -4221,7 +4510,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
if (root->reloc_root) {
btrfs_put_root(root->reloc_root);
@@ -4303,6 +4592,48 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
return btrfs_commit_transaction(trans);
}
+static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_transaction *trans;
+ struct btrfs_transaction *tmp;
+ bool found = false;
+
+ if (list_empty(&fs_info->trans_list))
+ return;
+
+ /*
+ * This function is only called at the very end of close_ctree(),
+ * thus no other running transaction, no need to take trans_lock.
+ */
+ ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
+ list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
+ struct extent_state *cached = NULL;
+ u64 dirty_bytes = 0;
+ u64 cur = 0;
+ u64 found_start;
+ u64 found_end;
+
+ found = true;
+ while (!find_first_extent_bit(&trans->dirty_pages, cur,
+ &found_start, &found_end, EXTENT_DIRTY, &cached)) {
+ dirty_bytes += found_end + 1 - found_start;
+ cur = found_end + 1;
+ }
+ btrfs_warn(fs_info,
+ "transaction %llu (with %llu dirty metadata bytes) is not committed",
+ trans->transid, dirty_bytes);
+ btrfs_cleanup_one_transaction(trans, fs_info);
+
+ if (trans == fs_info->running_transaction)
+ fs_info->running_transaction = NULL;
+ list_del_init(&trans->list);
+
+ btrfs_put_transaction(trans);
+ trace_btrfs_transaction_commit(fs_info);
+ }
+ ASSERT(!found);
+}
+
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
int ret;
@@ -4316,6 +4647,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
kthread_park(fs_info->cleaner_kthread);
+ /*
+ * If we had UNFINISHED_DROPS we could still be processing them, so
+ * clear that bit and wake up relocation so it can stop.
+ */
+ btrfs_wake_unfinished_drop(fs_info);
+
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
@@ -4372,8 +4709,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
- test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_error_commit_super(fs_info);
kthread_stop(fs_info->transaction_kthread);
@@ -4412,7 +4748,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_stop_all_workers(fs_info);
/* We shouldn't have any transaction open at this point */
- ASSERT(list_empty(&fs_info->trans_list));
+ warn_about_uncommitted_trans(fs_info);
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
@@ -4470,7 +4806,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
buf->start, transid, fs_info->generation);
@@ -4960,7 +5296,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->trans_lock);
btrfs_put_transaction(t);
- trace_btrfs_transaction_commit(fs_info->tree_root);
+ trace_btrfs_transaction_commit(fs_info);
spin_lock(&fs_info->trans_lock);
}
spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 0e7e9526b6a8..2e10514ecda8 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -6,9 +6,6 @@
#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H
-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
@@ -74,6 +71,12 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 objectid);
+int btrfs_global_root_insert(struct btrfs_root *root);
+void btrfs_global_root_delete(struct btrfs_root *root);
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key);
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
@@ -81,7 +84,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
@@ -106,6 +109,13 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
return NULL;
}
+static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
+}
+
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 04083ee5ae6e..c3eb52dbe61c 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -244,8 +244,8 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset);
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ab456cb4bf8..6aa92f84f465 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -87,6 +87,7 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
+ struct btrfs_root *root = btrfs_extent_root(fs_info, start);
int ret;
struct btrfs_key key;
struct btrfs_path *path;
@@ -98,7 +99,7 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
key.objectid = start;
key.offset = len;
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
btrfs_free_path(path);
return ret;
}
@@ -116,6 +117,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags)
{
+ struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_path *path;
@@ -153,7 +155,8 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ extent_root = btrfs_extent_root(fs_info, bytenr);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out_free;
@@ -171,7 +174,7 @@ search_again:
if (ret == 0) {
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (item_size >= sizeof(*ei)) {
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_item);
@@ -443,7 +446,7 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
u64 root_objectid,
u64 owner, u64 offset)
{
- struct btrfs_root *root = trans->fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
struct btrfs_extent_data_ref *ref;
struct extent_buffer *leaf;
@@ -519,7 +522,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner,
u64 offset, int refs_to_add)
{
- struct btrfs_root *root = trans->fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
struct extent_buffer *leaf;
u32 size;
@@ -593,8 +596,9 @@ fail:
}
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
- int refs_to_drop, int *last_ref)
+ int refs_to_drop)
{
struct btrfs_key key;
struct btrfs_extent_data_ref *ref1 = NULL;
@@ -626,8 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
num_refs -= refs_to_drop;
if (num_refs == 0) {
- ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
- *last_ref = 1;
+ ret = btrfs_del_item(trans, root, path);
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -685,7 +688,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 parent,
u64 root_objectid)
{
- struct btrfs_root *root = trans->fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
int ret;
@@ -709,6 +712,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 parent,
u64 root_objectid)
{
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
int ret;
@@ -721,8 +725,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
key.offset = root_objectid;
}
- ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
- path, &key, 0);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
btrfs_release_path(path);
return ret;
}
@@ -787,7 +790,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
u64 owner, u64 offset, int insert)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -865,7 +868,7 @@ again:
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
err = -EINVAL;
btrfs_print_v0_err(fs_info);
@@ -1007,7 +1010,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
__run_delayed_extent_op(extent_op, leaf, ei);
ptr = (unsigned long)ei + item_offset;
- end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+ end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
if (ptr < end - size)
memmove_extent_buffer(leaf, ptr + size, ptr,
end - size - ptr);
@@ -1068,8 +1071,7 @@ static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op,
- int *last_ref)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_extent_item *ei;
@@ -1117,9 +1119,8 @@ void update_inline_extent_backref(struct btrfs_path *path,
else
btrfs_set_shared_data_ref_count(leaf, sref, refs);
} else {
- *last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = (unsigned long)iref;
end = (unsigned long)ei + item_size;
if (ptr + size < end)
@@ -1162,8 +1163,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
}
return -EUCLEAN;
}
- update_inline_extent_backref(path, iref, refs_to_add,
- extent_op, NULL);
+ update_inline_extent_backref(path, iref, refs_to_add, extent_op);
} else if (ret == -ENOENT) {
setup_inline_extent_backref(trans->fs_info, path, iref, parent,
root_objectid, owner, offset,
@@ -1174,23 +1174,20 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
}
static int remove_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
- int refs_to_drop, int is_data, int *last_ref)
+ int refs_to_drop, int is_data)
{
int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
- if (iref) {
- update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
- last_ref);
- } else if (is_data) {
- ret = remove_extent_data_ref(trans, path, refs_to_drop,
- last_ref);
- } else {
- *last_ref = 1;
- ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
- }
+ if (iref)
+ update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+ else if (is_data)
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ else
+ ret = btrfs_del_item(trans, root, path);
return ret;
}
@@ -1266,7 +1263,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
-static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1313,22 +1310,21 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_bio *bbio = NULL;
-
+ struct btrfs_io_context *bioc = NULL;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
int i;
num_bytes = end - cur;
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bbio, 0);
+ &num_bytes, &bioc, 0);
/*
* Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
* -EOPNOTSUPP. For any such error, @num_bytes is not updated,
@@ -1337,8 +1333,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
if (ret < 0)
goto out;
- stripe = bbio->stripes;
- for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ stripe = bioc->stripes;
+ for (i = 0; i < bioc->num_stripes; i++, stripe++) {
u64 bytes;
struct btrfs_device *device = stripe->dev;
@@ -1361,7 +1357,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
* And since there are two loops, explicitly
* go to out to avoid confusion.
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
@@ -1372,7 +1368,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
*/
ret = 0;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
cur += num_bytes;
}
out:
@@ -1397,7 +1393,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -1573,6 +1569,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
@@ -1602,8 +1599,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
key.offset = head->num_bytes;
}
+ root = btrfs_extent_root(fs_info, key.objectid);
again:
- ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
err = ret;
goto out;
@@ -1635,7 +1633,7 @@ again:
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
err = -EINVAL;
@@ -1845,8 +1843,11 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved) {
btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
if (head->is_data) {
- ret = btrfs_del_csums(trans, fs_info->csum_root,
- head->bytenr, head->num_bytes);
+ struct btrfs_root *csum_root;
+
+ csum_root = btrfs_csum_root(fs_info, head->bytenr);
+ ret = btrfs_del_csums(trans, csum_root, head->bytenr,
+ head->num_bytes);
}
}
@@ -2286,7 +2287,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct extent_buffer *leaf;
struct btrfs_extent_data_ref *ref;
struct btrfs_extent_inline_ref *iref;
@@ -2317,7 +2318,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
goto out;
ret = 1;
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
/* If extent item has more than 1 inline ref then it's shared */
@@ -2376,7 +2377,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
out:
btrfs_free_path(path);
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
}
@@ -2438,10 +2439,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
- key.offset);
- generic_ref.skip_qgroup = for_reloc;
+ key.offset, root->root_key.objectid,
+ for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -2453,9 +2453,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = fs_info->nodesize;
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
- generic_ref.skip_qgroup = for_reloc;
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
+ root->root_key.objectid, for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -2759,12 +2758,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_unlock(&cache->lock);
if (!readonly && return_free_space &&
global_rsv->space_info == space_info) {
- u64 to_add = len;
-
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
- to_add = min(len, global_rsv->size -
- global_rsv->reserved);
+ u64 to_add = min(len, global_rsv->size -
+ global_rsv->reserved);
+
global_rsv->reserved += to_add;
btrfs_space_info_update_bytes_may_use(fs_info,
space_info, to_add);
@@ -2855,6 +2853,35 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, bool is_data)
+{
+ int ret;
+
+ if (is_data) {
+ struct btrfs_root *csum_root;
+
+ csum_root = btrfs_csum_root(trans->fs_info, bytenr);
+ ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ ret = add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
+}
+
/*
* Drop one or more refs of @node.
*
@@ -2923,7 +2950,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_root *extent_root = info->extent_root;
+ struct btrfs_root *extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -2936,9 +2963,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
- int last_ref = 0;
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
+ extent_root = btrfs_extent_root(info, bytenr);
+ ASSERT(extent_root);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2999,9 +3028,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto err_dump;
}
/* Must be SHARED_* item, remove the backref first */
- ret = remove_extent_backref(trans, path, NULL,
- refs_to_drop,
- is_data, &last_ref);
+ ret = remove_extent_backref(trans, extent_root, path,
+ NULL, refs_to_drop, is_data);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3071,7 +3099,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, extent_slot);
+ item_size = btrfs_item_size(leaf, extent_slot);
if (unlikely(item_size < sizeof(*ei))) {
ret = -EINVAL;
btrfs_print_v0_err(info);
@@ -3125,9 +3153,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
}
if (found_extent) {
- ret = remove_extent_backref(trans, path, iref,
- refs_to_drop, is_data,
- &last_ref);
+ ret = remove_extent_backref(trans, extent_root, path,
+ iref, refs_to_drop, is_data);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3172,7 +3199,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- last_ref = 1;
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
@@ -3181,26 +3207,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- if (is_data) {
- ret = btrfs_del_csums(trans, info->csum_root, bytenr,
- num_bytes);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
- }
-
- ret = add_to_free_space_tree(trans, bytenr, num_bytes);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
-
- ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
}
btrfs_release_path(path);
@@ -3278,20 +3285,20 @@ out_delayed_unlock:
}
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_ref generic_ref = { 0 };
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
buf->start, buf->len, parent);
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root->root_key.objectid);
+ root_id, 0, false);
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
BUG_ON(ret); /* -ENOMEM */
@@ -3301,7 +3308,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_block_group *cache;
bool must_pin = false;
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
if (!ret) {
btrfs_redirty_list_add(trans->transaction, buf);
@@ -3373,9 +3380,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
@@ -3386,9 +3393,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
}
if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -3476,7 +3483,9 @@ enum btrfs_extent_allocation_policy {
*/
struct find_free_extent_ctl {
/* Basic allocation info */
+ u64 ram_bytes;
u64 num_bytes;
+ u64 min_alloc_size;
u64 empty_size;
u64 flags;
int delalloc;
@@ -3495,6 +3504,9 @@ struct find_free_extent_ctl {
/* Allocation is called for tree-log */
bool for_treelog;
+ /* Allocation is called for data relocation */
+ bool for_data_reloc;
+
/* RAID index, converted from flags */
int index;
@@ -3756,8 +3768,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
u64 avail;
u64 bytenr = block_group->start;
u64 log_bytenr;
+ u64 data_reloc_bytenr;
int ret = 0;
- bool skip;
+ bool skip = false;
ASSERT(btrfs_is_zoned(block_group->fs_info));
@@ -3767,19 +3780,61 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
*/
spin_lock(&fs_info->treelog_bg_lock);
log_bytenr = fs_info->treelog_bg;
- skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
- (!ffe_ctl->for_treelog && bytenr == log_bytenr));
+ if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
+ skip = true;
spin_unlock(&fs_info->treelog_bg_lock);
if (skip)
return 1;
+ /*
+ * Do not allow non-relocation blocks in the dedicated relocation block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->relocation_bg_lock);
+ data_reloc_bytenr = fs_info->data_reloc_bg;
+ if (data_reloc_bytenr &&
+ ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
+ (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
+ skip = true;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ if (skip)
+ return 1;
+
+ /* Check RO and no space case before trying to activate it */
+ spin_lock(&block_group->lock);
+ if (block_group->ro ||
+ block_group->alloc_offset == block_group->zone_capacity) {
+ ret = 1;
+ /*
+ * May need to clear fs_info->{treelog,data_reloc}_bg.
+ * Return the error after taking the locks.
+ */
+ }
+ spin_unlock(&block_group->lock);
+
+ if (!ret && !btrfs_zone_activate(block_group)) {
+ ret = 1;
+ /*
+ * May need to clear fs_info->{treelog,data_reloc}_bg.
+ * Return the error after taking the locks.
+ */
+ }
+
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
spin_lock(&fs_info->treelog_bg_lock);
+ spin_lock(&fs_info->relocation_bg_lock);
+
+ if (ret)
+ goto out;
ASSERT(!ffe_ctl->for_treelog ||
block_group->start == fs_info->treelog_bg ||
fs_info->treelog_bg == 0);
+ ASSERT(!ffe_ctl->for_data_reloc ||
+ block_group->start == fs_info->data_reloc_bg ||
+ fs_info->data_reloc_bg == 0);
if (block_group->ro) {
ret = 1;
@@ -3796,7 +3851,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
goto out;
}
- avail = block_group->length - block_group->alloc_offset;
+ /*
+ * Do not allow currently used block group to be the data relocation
+ * dedicated block group.
+ */
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
+ avail = block_group->zone_capacity - block_group->alloc_offset;
if (avail < num_bytes) {
if (ffe_ctl->max_extent_size < avail) {
/*
@@ -3813,6 +3879,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
fs_info->treelog_bg = block_group->start;
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
+ fs_info->data_reloc_bg = block_group->start;
+
ffe_ctl->found_offset = start + block_group->alloc_offset;
block_group->alloc_offset += num_bytes;
spin_lock(&ctl->tree_lock);
@@ -3829,6 +3898,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
+ if (ret && ffe_ctl->for_data_reloc)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
@@ -3897,6 +3969,28 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
}
}
+static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return true;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /*
+ * If we have enough free space left in an already
+ * active block group and we can't activate any other
+ * zone now, do not allow allocating a new chunk and
+ * let find_free_extent() retry with a smaller size.
+ */
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+ !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+ return false;
+ return true;
+ default:
+ BUG();
+ }
+}
+
static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
{
switch (ffe_ctl->policy) {
@@ -3925,25 +4019,25 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
bool full_search)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = fs_info->chunk_root;
int ret;
if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
ffe_ctl->orig_have_caching_bg = true;
- if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
- ffe_ctl->have_caching_bg)
- return 1;
-
- if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
- return 1;
-
if (ins->objectid) {
found_extent(ffe_ctl, ins);
return 0;
}
+ if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ return 1;
+
+ ffe_ctl->index++;
+ if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
+ return 1;
+
/*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
@@ -3972,6 +4066,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
int exist = 0;
+ /*Check if allocation policy allows to create a new chunk */
+ if (!can_allocate_chunk(fs_info, ffe_ctl))
+ return -ENOSPC;
+
trans = current->journal_info;
if (trans)
exist = 1;
@@ -3984,7 +4082,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
}
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
- CHUNK_ALLOC_FORCE);
+ CHUNK_ALLOC_FORCE_FOR_EXTENT);
/* Do not bail out on ENOSPC since we can do more. */
if (ret == -ENOSPC)
@@ -4085,6 +4183,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
ffe_ctl->hint_byte = fs_info->treelog_bg;
spin_unlock(&fs_info->treelog_bg_lock);
}
+ if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ }
return 0;
default:
BUG();
@@ -4117,65 +4221,62 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
* |- If not found, re-iterate all block groups
*/
static noinline int find_free_extent(struct btrfs_root *root,
- u64 ram_bytes, u64 num_bytes, u64 empty_size,
- u64 hint_byte_orig, struct btrfs_key *ins,
- u64 flags, int delalloc)
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
- struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool full_search = false;
- bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
- WARN_ON(num_bytes < fs_info->sectorsize);
-
- ffe_ctl.num_bytes = num_bytes;
- ffe_ctl.empty_size = empty_size;
- ffe_ctl.flags = flags;
- ffe_ctl.search_start = 0;
- ffe_ctl.delalloc = delalloc;
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
- ffe_ctl.have_caching_bg = false;
- ffe_ctl.orig_have_caching_bg = false;
- ffe_ctl.found_offset = 0;
- ffe_ctl.hint_byte = hint_byte_orig;
- ffe_ctl.for_treelog = for_treelog;
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+ WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
+ ffe_ctl->search_start = 0;
+ /* For clustered allocation */
+ ffe_ctl->empty_cluster = 0;
+ ffe_ctl->last_ptr = NULL;
+ ffe_ctl->use_cluster = true;
+ ffe_ctl->have_caching_bg = false;
+ ffe_ctl->orig_have_caching_bg = false;
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+ ffe_ctl->loop = 0;
/* For clustered allocation */
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- ffe_ctl.last_ptr = NULL;
- ffe_ctl.use_cluster = true;
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ ffe_ctl->cached = 0;
+ ffe_ctl->max_extent_size = 0;
+ ffe_ctl->total_free_space = 0;
+ ffe_ctl->found_offset = 0;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
if (btrfs_is_zoned(fs_info))
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(root, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ ffe_ctl->flags);
- space_info = btrfs_find_space_info(fs_info, flags);
+ space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
if (!space_info) {
- btrfs_err(fs_info, "No space info for %llu", flags);
+ btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
return -ENOSPC;
}
- ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
+ ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
if (ret < 0)
return ret;
- ffe_ctl.search_start = max(ffe_ctl.search_start,
- first_logical_byte(fs_info, 0));
- ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
- if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
+ ffe_ctl->search_start = max(ffe_ctl->search_start,
+ first_logical_byte(fs_info, 0));
+ ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
+ if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
- ffe_ctl.search_start);
+ ffe_ctl->search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -4183,7 +4284,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
* However if we are re-searching with an ideal block group
* picked out then we don't care that the block group is cached.
*/
- if (block_group && block_group_bits(block_group, flags) &&
+ if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
@@ -4197,9 +4298,10 @@ static noinline int find_free_extent(struct btrfs_root *root,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(
- block_group->flags);
- btrfs_lock_block_group(block_group, delalloc);
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
+ btrfs_lock_block_group(block_group,
+ ffe_ctl->delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -4207,31 +4309,33 @@ static noinline int find_free_extent(struct btrfs_root *root,
}
}
search:
- ffe_ctl.have_caching_bg = false;
- if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
- ffe_ctl.index == 0)
+ ffe_ctl->have_caching_bg = false;
+ if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
+ ffe_ctl->index == 0)
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
- &space_info->block_groups[ffe_ctl.index], list) {
+ &space_info->block_groups[ffe_ctl->index], list) {
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro)) {
- if (for_treelog)
+ if (ffe_ctl->for_treelog)
btrfs_clear_treelog_bg(block_group);
+ if (ffe_ctl->for_data_reloc)
+ btrfs_clear_data_reloc_bg(block_group);
continue;
}
- btrfs_grab_block_group(block_group, delalloc);
- ffe_ctl.search_start = block_group->start;
+ btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
+ ffe_ctl->search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
* raid types, but we want to make sure we only allocate
* for the proper type.
*/
- if (!block_group_bits(block_group, flags)) {
+ if (!block_group_bits(block_group, ffe_ctl->flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_RAID56_MASK |
@@ -4242,7 +4346,7 @@ search:
* doesn't provide them, bail. This does allow us to
* fill raid0 from raid1.
*/
- if ((flags & extra) && !(block_group->flags & extra))
+ if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
goto loop;
/*
@@ -4250,14 +4354,14 @@ search:
* It's possible that we have MIXED_GROUP flag but no
* block group is mixed. Just skip such block group.
*/
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
continue;
}
have_block_group:
- ffe_ctl.cached = btrfs_block_group_done(block_group);
- if (unlikely(!ffe_ctl.cached)) {
- ffe_ctl.have_caching_bg = true;
+ ffe_ctl->cached = btrfs_block_group_done(block_group);
+ if (unlikely(!ffe_ctl->cached)) {
+ ffe_ctl->have_caching_bg = true;
ret = btrfs_cache_block_group(block_group, 0);
/*
@@ -4280,10 +4384,11 @@ have_block_group:
goto loop;
bg_ret = NULL;
- ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
+ ret = do_allocation(block_group, ffe_ctl, &bg_ret);
if (ret == 0) {
if (bg_ret && bg_ret != block_group) {
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group,
+ ffe_ctl->delalloc);
block_group = bg_ret;
}
} else if (ret == -EAGAIN) {
@@ -4293,46 +4398,49 @@ have_block_group:
}
/* Checks */
- ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
- fs_info->stripesize);
+ ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
+ fs_info->stripesize);
/* move on to the next group */
- if (ffe_ctl.search_start + num_bytes >
+ if (ffe_ctl->search_start + ffe_ctl->num_bytes >
block_group->start + block_group->length) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
- if (ffe_ctl.found_offset < ffe_ctl.search_start)
+ if (ffe_ctl->found_offset < ffe_ctl->search_start)
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset,
- ffe_ctl.search_start - ffe_ctl.found_offset);
+ ffe_ctl->found_offset,
+ ffe_ctl->search_start - ffe_ctl->found_offset);
- ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
- num_bytes, delalloc);
+ ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
+ ffe_ctl->num_bytes,
+ ffe_ctl->delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
- ins->objectid = ffe_ctl.search_start;
- ins->offset = num_bytes;
+ ins->objectid = ffe_ctl->search_start;
+ ins->offset = ffe_ctl->num_bytes;
- trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
- num_bytes);
- btrfs_release_block_group(block_group, delalloc);
+ trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
break;
loop:
- release_block_group(block_group, &ffe_ctl, delalloc);
+ release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
if (ret > 0)
goto search;
@@ -4341,12 +4449,12 @@ loop:
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
*/
- if (!ffe_ctl.max_extent_size)
- ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
+ if (!ffe_ctl->max_extent_size)
+ ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
spin_lock(&space_info->lock);
- space_info->max_extent_size = ffe_ctl.max_extent_size;
+ space_info->max_extent_size = ffe_ctl->max_extent_size;
spin_unlock(&space_info->lock);
- ins->offset = ffe_ctl.max_extent_size;
+ ins->offset = ffe_ctl->max_extent_size;
} else if (ret == -ENOSPC) {
ret = cache_block_group_error;
}
@@ -4404,16 +4512,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
struct btrfs_key *ins, int is_data, int delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct find_free_extent_ctl ffe_ctl = {};
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
- hint_byte, ins, flags, delalloc);
+
+ ffe_ctl.ram_bytes = ram_bytes;
+ ffe_ctl.num_bytes = num_bytes;
+ ffe_ctl.min_alloc_size = min_alloc_size;
+ ffe_ctl.empty_size = empty_size;
+ ffe_ctl.flags = flags;
+ ffe_ctl.delalloc = delalloc;
+ ffe_ctl.hint_byte = hint_byte;
+ ffe_ctl.for_treelog = for_treelog;
+ ffe_ctl.for_data_reloc = for_data_reloc;
+
+ ret = find_free_extent(root, ins, &ffe_ctl);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
} else if (ret == -ENOSPC) {
@@ -4431,8 +4551,8 @@ again:
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu tree-log %d",
- flags, num_bytes, for_treelog);
+ "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
+ flags, num_bytes, for_treelog, for_data_reloc);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
@@ -4480,12 +4600,35 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
return ret;
}
+static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+
+ ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
+ if (ret)
+ return ret;
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
+ if (ret) {
+ ASSERT(!ret);
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ bytenr, num_bytes);
+ return ret;
+ }
+
+ trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
+ return 0;
+}
+
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
struct btrfs_extent_inline_ref *iref;
@@ -4505,8 +4648,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
- ins, size);
+ extent_root = btrfs_extent_root(fs_info, ins->objectid);
+ ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
if (ret) {
btrfs_free_path(path);
return ret;
@@ -4539,18 +4682,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
- if (ret)
- return ret;
-
- ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
- if (ret) { /* -ENOENT, logic error */
- btrfs_err(fs_info, "update block group failed for %llu %llu",
- ins->objectid, ins->offset);
- BUG();
- }
- trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
- return ret;
+ return alloc_reserved_extent(trans, ins->objectid, ins->offset);
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
@@ -4558,6 +4690,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
struct btrfs_key extent_key;
@@ -4567,7 +4700,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 num_bytes;
u64 flags = extent_op->flags_to_set;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -4577,20 +4709,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
extent_key.offset = ref->level;
extent_key.type = BTRFS_METADATA_ITEM_KEY;
- num_bytes = fs_info->nodesize;
} else {
extent_key.offset = node->num_bytes;
extent_key.type = BTRFS_EXTENT_ITEM_KEY;
size += sizeof(*block_info);
- num_bytes = node->num_bytes;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
- &extent_key, size);
+ extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
+ ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
+ size);
if (ret) {
btrfs_free_path(path);
return ret;
@@ -4626,22 +4757,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, extent_key.objectid,
- num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_update_block_group(trans, extent_key.objectid,
- fs_info->nodesize, 1);
- if (ret) { /* -ENOENT, logic error */
- btrfs_err(fs_info, "update block group failed for %llu %llu",
- extent_key.objectid, extent_key.offset);
- BUG();
- }
-
- trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
- fs_info->nodesize);
- return ret;
+ return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
}
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -4655,7 +4771,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
@@ -4847,8 +4964,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins.objectid, ins.offset, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid,
+ root->root_key.objectid, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
@@ -5265,7 +5382,8 @@ skip:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
fs_info->nodesize, parent);
- btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
+ 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
@@ -5386,7 +5504,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
goto owner_mismatch;
}
- btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
+ wc->refs[level] == 1);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
@@ -5491,6 +5610,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
int ret;
int level;
bool root_dropped = false;
+ bool unfinished_drop = false;
btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
@@ -5533,6 +5653,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
* already dropped.
*/
set_bit(BTRFS_ROOT_DELETING, &root->state);
+ unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
@@ -5708,6 +5830,13 @@ out_free:
btrfs_free_path(path);
out:
/*
+ * We were an unfinished drop root, check to see if there are any
+ * pending, and if not clear and wake up any waiters.
+ */
+ if (!err && unfinished_drop)
+ btrfs_maybe_wake_unfinished_drop(fs_info);
+
+ /*
* So if we need to stop dropping the snapshot for whatever reason we
* need to make sure to add it back to the dead root list so that we
* keep trying to do the work later. This also cleans up roots if we
@@ -5750,13 +5879,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- btrfs_assert_tree_locked(parent);
+ btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
- btrfs_assert_tree_locked(node);
+ btrfs_assert_tree_write_locked(node);
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
@@ -5965,6 +6094,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
int dev_ret = 0;
int ret = 0;
+ if (range->start == U64_MAX)
+ return -EINVAL;
+
/*
* Check range overflow if range->len is set.
* The default range->len is U64_MAX.
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index aaddd7225348..33c19f51d79b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -12,7 +12,6 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
-#include <linux/cleancache.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "extent_io.h"
@@ -241,7 +240,7 @@ int __init extent_io_init(void)
return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio),
+ offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
goto free_buffer_cache;
@@ -1508,17 +1507,17 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
+ struct address_space *mapping = inode->i_mapping;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- __set_page_dirty_nobuffers(page);
- account_page_redirty(page);
- put_page(page);
- index++;
+ folio = filemap_get_folio(mapping, index);
+ filemap_dirty_folio(mapping, folio);
+ folio_account_redirty(folio);
+ index += folio_nr_pages(folio);
+ folio_put(folio);
}
}
@@ -1975,10 +1974,18 @@ static noinline int lock_delalloc_pages(struct inode *inode,
/*
* Find and lock a contiguous range of bytes in the file marked as delalloc, no
- * more than @max_bytes. @Start and @end are used to return the range,
+ * more than @max_bytes.
+ *
+ * @start: The original start bytenr to search.
+ * Will store the extent range start bytenr.
+ * @end: The original end bytenr of the search range
+ * Will store the extent range end bytenr.
+ *
+ * Return true if we find a delalloc range which starts inside the original
+ * range, and @start/@end will store the delalloc range start/end.
*
- * Return: true if we find something
- * false if nothing was in the tree
+ * Return false if we can't find any delalloc range which starts inside the
+ * original range, and @start/@end will be the non-delalloc range start/end.
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
@@ -1986,6 +1993,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
u64 *end)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ const u64 orig_start = *start;
+ const u64 orig_end = *end;
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
@@ -1994,15 +2003,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
int ret;
int loops = 0;
+ /* Caller should pass a valid @end to indicate the search range end */
+ ASSERT(orig_end > orig_start);
+
+ /* The range should at least cover part of the page */
+ ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
+ orig_end <= page_offset(locked_page)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
- if (!found || delalloc_end <= *start) {
+ if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
*start = delalloc_start;
- *end = delalloc_end;
+
+ /* @delalloc_end can be -1, never go beyond @orig_end */
+ *end = min(delalloc_end, orig_end);
free_extent_state(cached_state);
return false;
}
@@ -2282,29 +2299,29 @@ int free_io_failure(struct extent_io_tree *failure_tree,
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
{
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
u64 sector;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
- if (btrfs_is_zoned(fs_info))
- return btrfs_repair_one_zone(fs_info, logical);
+ if (btrfs_repair_one_zone(fs_info, logical))
+ return 0;
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
@@ -2317,28 +2334,28 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
* stripe's dev and sector.
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bbio, 0);
+ &map_length, &bioc, 0);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- ASSERT(bbio->mirror_num == 1);
+ ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
+ &map_length, &bioc, mirror_num);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- BUG_ON(mirror_num != bbio->mirror_num);
+ BUG_ON(mirror_num != bioc->mirror_num);
}
- sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
+ sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[bbio->mirror_num - 1].dev;
- btrfs_put_bbio(bbio);
+ dev = bioc->stripes[bioc->mirror_num - 1].dev;
+ btrfs_put_bioc(bioc);
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
btrfs_bio_counter_dec(fs_info);
@@ -2593,6 +2610,7 @@ static bool btrfs_check_repairable(struct inode *inode,
* a good copy of the failed sector and if we succeed, we have setup
* everything for repair_io_failure to do the rest for us.
*/
+ ASSERT(failed_mirror);
failrec->failed_mirror = failed_mirror;
failrec->this_mirror++;
if (failrec->this_mirror == failed_mirror)
@@ -2618,11 +2636,10 @@ int btrfs_repair_one_sector(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+ struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
- struct btrfs_io_bio *repair_io_bio;
- blk_status_t status;
+ struct btrfs_bio *repair_bbio;
btrfs_debug(fs_info,
"repair read error: read error at %llu", start);
@@ -2639,36 +2656,36 @@ int btrfs_repair_one_sector(struct inode *inode,
return -EIO;
}
- repair_bio = btrfs_io_bio_alloc(1);
- repair_io_bio = btrfs_io_bio(repair_bio);
+ repair_bio = btrfs_bio_alloc(1);
+ repair_bbio = btrfs_bio(repair_bio);
+ repair_bbio->file_offset = start;
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
repair_bio->bi_private = failed_bio->bi_private;
- if (failed_io_bio->csum) {
+ if (failed_bbio->csum) {
const u32 csum_size = fs_info->csum_size;
- repair_io_bio->csum = repair_io_bio->csum_inline;
- memcpy(repair_io_bio->csum,
- failed_io_bio->csum + csum_size * icsum, csum_size);
+ repair_bbio->csum = repair_bbio->csum_inline;
+ memcpy(repair_bbio->csum,
+ failed_bbio->csum + csum_size * icsum, csum_size);
}
bio_add_page(repair_bio, page, failrec->len, pgoff);
- repair_io_bio->logical = failrec->start;
- repair_io_bio->iter = repair_bio->bi_iter;
+ repair_bbio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
"repair read error: submitting new read to mirror %d",
failrec->this_mirror);
- status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
- failrec->bio_flags);
- if (status) {
- free_io_failure(failure_tree, tree, failrec);
- bio_put(repair_bio);
- }
- return blk_status_to_errno(status);
+ /*
+ * At this point we have a bio, so any errors from submit_bio_hook()
+ * will be handled by the endio on the repair_bio, so we can't return an
+ * error here.
+ */
+ submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags);
+ return BLK_STS_OK;
}
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
@@ -2976,7 +2993,7 @@ static struct extent_buffer *find_extent_buffer_readpage(
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
/*
@@ -3003,7 +3020,7 @@ static void end_bio_extent_readpage(struct bio *bio)
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
bio->bi_iter.bi_sector, bio->bi_status,
- io_bio->mirror_num);
+ bbio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -3028,14 +3045,14 @@ static void end_bio_extent_readpage(struct bio *bio)
end = start + bvec->bv_len - 1;
len = bvec->bv_len;
- mirror = io_bio->mirror_num;
+ mirror = bbio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode)) {
- error_bitmap = btrfs_verify_data_csum(io_bio,
+ error_bitmap = btrfs_verify_data_csum(bbio,
bio_offset, page, start, end);
ret = error_bitmap;
} else {
- ret = btrfs_validate_metadata_buffer(io_bio,
+ ret = btrfs_validate_metadata_buffer(bbio,
page, start, end, mirror);
}
if (ret)
@@ -3052,6 +3069,14 @@ static void end_bio_extent_readpage(struct bio *bio)
if (is_data_inode(inode)) {
/*
+ * If we failed to submit the IO at all we'll have a
+ * mirror_num == 0, in which case we need to just mark
+ * the page with an error and unlock it and carry on.
+ */
+ if (mirror == 0)
+ goto readpage_ok;
+
+ /*
* btrfs_submit_read_repair() will handle all the good
* and bad sectors, we just continue to the next bvec.
*/
@@ -3070,9 +3095,6 @@ static void end_bio_extent_readpage(struct bio *bio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = mirror;
atomic_dec(&eb->io_pages);
- if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
- &eb->bflags))
- btree_readahead_hook(eb, -EIO);
}
readpage_ok:
if (likely(uptodate)) {
@@ -3106,7 +3128,7 @@ readpage_ok:
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
- btrfs_io_bio_free_csum(io_bio);
+ btrfs_bio_free_csum(bbio);
bio_put(bio);
}
@@ -3115,78 +3137,67 @@ readpage_ok:
* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
* 'bio' because use of __GFP_ZERO is not supported.
*/
-static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
+static inline void btrfs_bio_init(struct btrfs_bio *bbio)
{
- memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
}
/*
- * The following helpers allocate a bio. As it's backed by a bioset, it'll
- * never fail. We're returning a bio right now but you can call btrfs_io_bio
- * for the appropriate container_of magic
+ * Allocate a btrfs_bio, with @nr_iovecs as maximum number of iovecs.
+ *
+ * The bio allocation is backed by bioset and does not fail.
*/
-struct bio *btrfs_bio_alloc(u64 first_byte)
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
{
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
- bio->bi_iter.bi_sector = first_byte >> 9;
- btrfs_io_bio_init(btrfs_io_bio(bio));
+ ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
+ bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio));
return bio;
}
struct bio *btrfs_bio_clone(struct bio *bio)
{
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
- new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
- btrfs_bio = btrfs_io_bio(new);
- btrfs_io_bio_init(btrfs_bio);
- btrfs_bio->iter = bio->bi_iter;
+ new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset);
+ bbio = btrfs_bio(new);
+ btrfs_bio_init(bbio);
+ bbio->iter = bio->bi_iter;
return new;
}
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- /* Bio allocation backed by a bioset does not fail */
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
- btrfs_io_bio_init(btrfs_io_bio(bio));
- return bio;
-}
-
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
/* this will never fail when it's backed by a bioset */
- bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_io_bio_init(btrfs_bio);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio);
bio_trim(bio, offset >> 9, size >> 9);
- btrfs_bio->iter = bio->bi_iter;
+ bbio->iter = bio->bi_iter;
return bio;
}
/**
* Attempt to add a page to bio
*
- * @bio: destination bio
+ * @bio_ctrl: record both the bio, and its bio_flags
* @page: page to add to the bio
* @disk_bytenr: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @pg_offset: starting offset in the page
* @size: portion of page that we want to write
- * @prev_bio_flags: flags of previous bio to see if we can merge the current one
+ * @pg_offset: starting offset in the page
* @bio_flags: flags of the current bio to see if we can merge them
*
* Attempt to add a page to bio considering stripe alignment etc.
@@ -3276,8 +3287,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
else
bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
- if (!btrfs_is_zoned(fs_info) ||
- bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
+ if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
return 0;
}
@@ -3307,41 +3317,54 @@ static int alloc_new_bio(struct btrfs_inode *inode,
struct bio *bio;
int ret;
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
*/
if (bio_flags & EXTENT_BIO_COMPRESSED)
- bio = btrfs_bio_alloc(disk_bytenr);
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
else
- bio = btrfs_bio_alloc(disk_bytenr + offset);
+ bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
bio_ctrl->bio_flags = bio_flags;
bio->bi_end_io = end_io_func;
bio->bi_private = &inode->io_tree;
- bio->bi_write_hint = inode->vfs_inode.i_write_hint;
bio->bi_opf = opf;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
- if (wbc) {
- struct block_device *bdev;
- bdev = fs_info->fs_devices->latest_bdev;
- bio_set_dev(bio, bdev);
- wbc_init_bio(wbc, bio);
- }
- if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct btrfs_device *device;
+ if (wbc) {
+ /*
+ * For Zone append we need the correct block_device that we are
+ * going to write to set in the bio to be able to respect the
+ * hardware limitation. Look it up here:
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct btrfs_device *dev;
+
+ dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
+ fs_info->sectorsize);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto error;
+ }
- device = btrfs_zoned_get_device(fs_info, disk_bytenr,
- fs_info->sectorsize);
- if (IS_ERR(device)) {
- ret = PTR_ERR(device);
- goto error;
+ bio_set_dev(bio, dev->bdev);
+ } else {
+ /*
+ * Otherwise pick the last added device to support
+ * cgroup writeback. For multi-device file systems this
+ * means blk-cgroup policies have to always be set on the
+ * last added/replaced device. This is a bit odd but has
+ * been like that for a long time.
+ */
+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
}
-
- btrfs_io_bio(bio)->device = device;
+ wbc_init_bio(wbc, bio);
+ } else {
+ ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
}
return 0;
error:
@@ -3532,7 +3555,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
}
em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
- if (em_cached && !IS_ERR_OR_NULL(em)) {
+ if (em_cached && !IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
@@ -3561,7 +3584,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
u64 cur_end;
struct extent_map *em;
int ret = 0;
- int nr = 0;
size_t pg_offset = 0;
size_t iosize;
size_t blocksize = inode->i_sb->s_blocksize;
@@ -3575,15 +3597,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
goto out;
}
- if (!PageUptodate(page)) {
- if (cleancache_get_page(page) == 0) {
- BUG_ON(blocksize != PAGE_SIZE);
- unlock_extent(tree, start, end);
- unlock_page(page);
- goto out;
- }
- }
-
if (page->index == last_byte >> PAGE_SHIFT) {
size_t zero_offset = offset_in_page(last_byte);
@@ -3599,6 +3612,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
bool force_bio_submit = false;
u64 disk_bytenr;
+ ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
struct extent_state *cached = NULL;
@@ -3614,9 +3628,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
- if (IS_ERR_OR_NULL(em)) {
+ if (IS_ERR(em)) {
unlock_extent(tree, cur, end);
end_page_read(page, false, cur, end + 1 - cur);
+ ret = PTR_ERR(em);
break;
}
extent_offset = cur - em->start;
@@ -3727,9 +3742,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
end_bio_extent_readpage, 0,
this_bio_flag,
force_bio_submit);
- if (!ret) {
- nr++;
- } else {
+ if (ret) {
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, false, cur, iosize);
goto out;
@@ -3776,18 +3789,20 @@ static void update_nr_written(struct writeback_control *wbc,
* This returns < 0 if there were errors (page still locked)
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
- struct page *page, struct writeback_control *wbc,
- u64 delalloc_start, unsigned long *nr_written)
+ struct page *page, struct writeback_control *wbc)
{
- u64 page_end = delalloc_start + PAGE_SIZE - 1;
- bool found;
+ const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 delalloc_start = page_offset(page);
u64 delalloc_to_write = 0;
- u64 delalloc_end = 0;
+ /* How many pages are started by btrfs_run_delalloc_range() */
+ unsigned long nr_written = 0;
int ret;
int page_started = 0;
+ while (delalloc_start < page_end) {
+ u64 delalloc_end = page_end;
+ bool found;
- while (delalloc_end < page_end) {
found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
@@ -3796,7 +3811,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
continue;
}
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
- delalloc_end, &page_started, nr_written, wbc);
+ delalloc_end, &page_started, &nr_written, wbc);
if (ret) {
btrfs_page_set_error(inode->root->fs_info, page,
page_offset(page), PAGE_SIZE);
@@ -3819,16 +3834,13 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
thresh);
}
- /* did the fill delalloc function already unlock and start
- * the IO?
- */
+ /* Did btrfs_run_delalloc_range() already unlock and start the IO? */
if (page_started) {
/*
- * we've unlocked the page, so we can't update
- * the mapping's writeback index, just update
- * nr_to_write.
+ * We've unlocked the page, so we can't update the mapping's
+ * writeback index, just update nr_to_write.
*/
- wbc->nr_to_write -= *nr_written;
+ wbc->nr_to_write -= nr_written;
return 1;
}
@@ -3854,12 +3866,11 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
- unsigned long dirty_bitmap;
unsigned long flags;
- int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits;
- int range_start_bit = nbits;
+ int range_start_bit;
int range_end_bit;
/*
@@ -3872,13 +3883,18 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
return;
}
+ range_start_bit = spi->dirty_offset +
+ (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+
/* We should have the page locked, but just in case */
spin_lock_irqsave(&subpage->lock, flags);
- dirty_bitmap = subpage->dirty_bitmap;
+ bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
+ spi->dirty_offset + spi->bitmap_nr_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit,
- BTRFS_SUBPAGE_BITMAP_SIZE);
+ range_start_bit -= spi->dirty_offset;
+ range_end_bit -= spi->dirty_offset;
+
*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
}
@@ -3896,7 +3912,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct writeback_control *wbc,
struct extent_page_data *epd,
loff_t i_size,
- unsigned long nr_written,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -3915,7 +3930,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
- update_nr_written(wbc, nr_written);
unlock_page(page);
return 1;
}
@@ -3924,7 +3938,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
- update_nr_written(wbc, nr_written + 1);
+ update_nr_written(wbc, 1);
while (cur <= end) {
u64 disk_bytenr;
@@ -3956,7 +3970,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
}
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
- if (IS_ERR_OR_NULL(em)) {
+ if (IS_ERR(em)) {
btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
break;
@@ -4053,15 +4067,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
- u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u64 page_start = page_offset(page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- unsigned long nr_written = 0;
trace___extent_writepage(page, inode, wbc);
@@ -4073,8 +4088,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_invalidate(folio, 0, folio_size(folio));
+ folio_unlock(folio);
return 0;
}
@@ -4090,8 +4105,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
if (!epd->extent_locked) {
- ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
- &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
if (ret == 1)
return 0;
if (ret)
@@ -4099,7 +4113,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
- nr_written, &nr);
+ &nr);
if (ret == 1)
return 0;
@@ -4141,8 +4155,20 @@ done:
* capable of that.
*/
if (PageError(page))
- end_extent_writepage(page, ret, start, page_end);
- unlock_page(page);
+ end_extent_writepage(page, ret, page_start, page_end);
+ if (epd->extent_locked) {
+ /*
+ * If epd->extent_locked, it's from extent_write_locked_range(),
+ * the page can either be locked by lock_page() or
+ * process_one_page().
+ * Let btrfs_page_unlock_writer() handle both cases.
+ */
+ ASSERT(wbc);
+ btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
+ wbc->range_end + 1 - wbc->range_start);
+ } else {
+ unlock_page(page);
+ }
ASSERT(ret <= 0);
return ret;
}
@@ -4155,6 +4181,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
+ if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
+ btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
+
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
@@ -4285,6 +4314,20 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
return;
/*
+ * A read may stumble upon this buffer later, make sure that it gets an
+ * error and knows there was an error.
+ */
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ /*
+ * We need to set the mapping with the io error as well because a write
+ * error will flip the file system readonly, and then syncfs() will
+ * return a 0 because we are readonly if we don't modify the err seq for
+ * the superblock.
+ */
+ mapping_set_error(page->mapping, -EIO);
+
+ /*
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
@@ -4602,12 +4645,11 @@ static int submit_eb_subpage(struct page *page,
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
- const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
int ret;
/* Lock and write each dirty extent buffers in the range */
- while (bit_start < nbits) {
+ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
struct extent_buffer *eb;
unsigned long flags;
@@ -4623,7 +4665,8 @@ static int submit_eb_subpage(struct page *page,
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!((1 << bit_start) & subpage->dirty_bitmap)) {
+ if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+ subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&page->mapping->private_lock);
bit_start++;
@@ -4756,8 +4799,14 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
free_extent_buffer(eb);
return ret;
}
- if (cache)
+ if (cache) {
+ /*
+ * Implies write in zoned mode. Mark the last eb in a block group.
+ */
+ if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
+ set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
btrfs_put_block_group(cache);
+ }
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
@@ -4873,7 +4922,7 @@ retry:
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
*/
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (!BTRFS_FS_ERROR(fs_info)) {
ret = flush_write_bio(&epd);
} else {
ret = -EROFS;
@@ -5069,23 +5118,28 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
return ret;
}
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode)
+/*
+ * Submit the pages in the range to bio for call sites where the delalloc range
+ * has already been run (i.e. ordered extent inserted) and all pages are still
+ * locked.
+ */
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
{
+ bool found_error = false;
+ int first_error = 0;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long nr_pages = (end - start + PAGE_SIZE) >>
- PAGE_SHIFT;
-
+ u64 cur = start;
+ unsigned long nr_pages;
+ const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.extent_locked = 1,
- .sync_io = mode == WB_SYNC_ALL,
+ .sync_io = 1,
};
struct writeback_control wbc_writepages = {
- .sync_mode = mode,
- .nr_to_write = nr_pages * 2,
+ .sync_mode = WB_SYNC_ALL,
.range_start = start,
.range_end = end + 1,
/* We're called from an async helper function */
@@ -5093,33 +5147,49 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
.no_cgroup_owner = 1,
};
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
+ nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
+ PAGE_SHIFT;
+ wbc_writepages.nr_to_write = nr_pages * 2;
+
wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
- while (start <= end) {
- page = find_get_page(mapping, start >> PAGE_SHIFT);
- if (clear_page_dirty_for_io(page))
- ret = __extent_writepage(page, &wbc_writepages, &epd);
- else {
- btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
- page, start, start + PAGE_SIZE - 1, true);
- unlock_page(page);
+ while (cur <= end) {
+ u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+
+ page = find_get_page(mapping, cur >> PAGE_SHIFT);
+ /*
+ * All pages in the range are locked since
+ * btrfs_run_delalloc_range(), thus there is no way to clear
+ * the page dirty flag.
+ */
+ ASSERT(PageLocked(page));
+ ASSERT(PageDirty(page));
+ clear_page_dirty_for_io(page);
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ found_error = true;
+ first_error = ret;
}
put_page(page);
- start += PAGE_SIZE;
+ cur = cur_end + 1;
}
- ASSERT(ret <= 0);
- if (ret == 0)
+ if (!found_error)
ret = flush_write_bio(&epd);
else
end_write_bio(&epd, ret);
wbc_detach_inode(&wbc_writepages);
+ if (found_error)
+ return first_error;
return ret;
}
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct inode *inode = mapping->host;
int ret = 0;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
@@ -5127,7 +5197,13 @@ int extent_writepages(struct address_space *mapping,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ /*
+ * Allow only a single thread to do the reloc work in zoned mode to
+ * protect the write pointer updates.
+ */
+ btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
@@ -5163,17 +5239,17 @@ void extent_readahead(struct readahead_control *rac)
}
/*
- * basic invalidatepage code, this waits on any locked or writeback
- * ranges corresponding to the page, and then deletes any extent state
+ * basic invalidate_folio code, this waits on any locked or writeback
+ * ranges corresponding to the folio, and then deletes any extent state
* records from the tree
*/
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset)
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset)
{
struct extent_state *cached_state = NULL;
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+ u64 start = folio_pos(folio);
+ u64 end = start + folio_size(folio) - 1;
+ size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
/* This function is only called for the btree inode */
ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
@@ -5183,7 +5259,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
return 0;
lock_extent_bits(tree, start, end, &cached_state);
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
/*
* Currently for btree io tree, only EXTENT_LOCKED is utilized,
@@ -5335,7 +5411,7 @@ static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
break;
len = ALIGN(len, sectorsize);
em = btrfs_get_extent_fiemap(inode, offset, len);
- if (IS_ERR_OR_NULL(em))
+ if (IS_ERR(em))
return em;
/* if this isn't a hole return it */
@@ -6137,13 +6213,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* page, but it may change in the future for 16K page size
* support, so we still preallocate the memory in the loop.
*/
- ret = btrfs_alloc_subpage(fs_info, &prealloc,
- BTRFS_SUBPAGE_METADATA);
- if (ret < 0) {
- unlock_page(p);
- put_page(p);
- exists = ERR_PTR(ret);
- goto free_eb;
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (IS_ERR(prealloc)) {
+ ret = PTR_ERR(prealloc);
+ unlock_page(p);
+ put_page(p);
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
}
spin_lock(&mapping->private_lock);
@@ -6530,6 +6608,14 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
+ /*
+ * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
+ * operation, which could potentially still be in flight. In this case
+ * we simply want to return an error.
+ */
+ if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
+ return -EIO;
+
if (eb->fs_info->sectorsize < PAGE_SIZE)
return read_extent_buffer_subpage(eb, wait, mirror_num);
@@ -6776,14 +6862,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ /*
+ * If we are using the commit root we could potentially clear a page
+ * Uptodate while we're using the extent buffer that we've previously
+ * looked up. We don't want to complain in this case, as the page was
+ * valid before, we just didn't write it out. Instead we want to catch
+ * the case where we didn't actually read the block properly, which
+ * would have !PageUptodate && !PageError, as we clear PageError before
+ * reading.
+ */
if (fs_info->sectorsize < PAGE_SIZE) {
- bool uptodate;
+ bool uptodate, error;
uptodate = btrfs_subpage_test_uptodate(fs_info, page,
eb->start, eb->len);
- WARN_ON(!uptodate);
+ error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
+ WARN_ON(!uptodate && !error);
} else {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!PageUptodate(page) && !PageError(page));
}
}
@@ -7167,32 +7263,41 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
+#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
- struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+ struct extent_buffer *gang[GANG_LOOKUP_SIZE];
struct extent_buffer *found = NULL;
u64 page_start = page_offset(page);
- int ret;
- int i;
+ u64 cur = page_start;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
- ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
lockdep_assert_held(&fs_info->buffer_lock);
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
- bytenr >> fs_info->sectorsize_bits,
- PAGE_SIZE / fs_info->nodesize);
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= page_start + PAGE_SIZE)
- break;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- break;
+ while (cur < page_start + PAGE_SIZE) {
+ int ret;
+ int i;
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+ (void **)gang, cur >> fs_info->sectorsize_bits,
+ min_t(unsigned int, GANG_LOOKUP_SIZE,
+ PAGE_SIZE / fs_info->nodesize));
+ if (ret == 0)
+ goto out;
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ goto out;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ goto out;
+ }
}
+ cur = gang[ret - 1]->start + gang[ret - 1]->len;
}
+out:
return found;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 53abdc280451..151e9da5da2d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,7 @@ enum {
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
+ EXTENT_BUFFER_ZONE_FINISH,
};
/* these are flags for __process_pages_contig */
@@ -117,7 +118,7 @@ struct btrfs_bio_ctrl {
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- unsigned int bytes_changed;
+ u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
@@ -183,8 +184,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
@@ -277,14 +277,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
-struct bio *btrfs_bio_alloc(u64 first_byte);
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a8e02f7b6c7..6fee14ce2e6b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -261,6 +261,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
@@ -278,6 +279,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
free_extent_map(merge);
}
}
@@ -360,7 +362,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
set_extent_bits_nowait(&device->alloc_state, stripe->physical,
@@ -375,7 +377,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
__clear_extent_bit(&device->alloc_state, stripe->physical,
@@ -490,6 +492,8 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
*/
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
+ lockdep_assert_held_write(&tree->lock);
+
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
@@ -504,6 +508,8 @@ void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *new,
int modified)
{
+ lockdep_assert_held_write(&tree->lock);
+
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
ASSERT(extent_map_in_tree(cur));
if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 8e217337dff9..d2fa32ffe304 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -25,6 +25,8 @@ enum {
EXTENT_FLAG_FILLING,
/* filesystem extent mapping type */
EXTENT_FLAG_FS_MAPPING,
+ /* This em is merged from two or more physically adjacent ems */
+ EXTENT_FLAG_MERGED,
};
struct extent_map {
@@ -40,6 +42,12 @@ struct extent_map {
u64 ram_bytes;
u64 block_start;
u64 block_len;
+
+ /*
+ * Generation of the extent map, for merged em it's the highest
+ * generation of all merged ems.
+ * For non-merged extents, it's from btrfs_file_extent_item::generation.
+ */
u64 generation;
unsigned long flags;
/* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 0b9401a5afd3..c828f971a346 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -208,7 +208,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
csum_offset = (bytenr - found_key.offset) >>
fs_info->sectorsize_bits;
- csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
+ csums_in_item = btrfs_item_size(leaf, path->slots[0]);
csums_in_item /= csum_size;
if (csum_offset == csums_in_item) {
@@ -257,6 +257,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 disk_bytenr,
u64 len, u8 *dst)
{
+ struct btrfs_root *csum_root;
struct btrfs_csum_item *item = NULL;
struct btrfs_key key;
const u32 sectorsize = fs_info->sectorsize;
@@ -274,7 +275,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
csum_start = key.offset;
csum_len = (itemsize / csum_size) * sectorsize;
@@ -285,13 +286,14 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
/* Current item doesn't contain the desired range, search again */
btrfs_release_path(path);
- item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0);
+ csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+ item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
csum_start = key.offset;
csum_len = (itemsize / csum_size) * sectorsize;
@@ -303,7 +305,7 @@ found:
read_extent_buffer(path->nodes[0], dst, (unsigned long)item,
ret * csum_size);
out:
- if (ret == -ENOENT)
+ if (ret == -ENOENT || ret == -EFBIG)
ret = 0;
return ret;
}
@@ -358,7 +360,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
* @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
* checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
* NULL, the checksum buffer is allocated and returned in
- * btrfs_io_bio(bio)->csum instead.
+ * btrfs_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
@@ -366,6 +368,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_bio *bbio = NULL;
struct btrfs_path *path;
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
@@ -375,8 +378,10 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
u8 *csum;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
int count = 0;
+ blk_status_t ret = BLK_STS_OK;
- if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+ if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
return BLK_STS_OK;
/*
@@ -397,19 +402,18 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
return BLK_STS_RESOURCE;
if (!dst) {
- struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+ bbio = btrfs_bio(bio);
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
- GFP_NOFS);
- if (!btrfs_bio->csum) {
+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+ if (!bbio->csum) {
btrfs_free_path(path);
return BLK_STS_RESOURCE;
}
} else {
- btrfs_bio->csum = btrfs_bio->csum_inline;
+ bbio->csum = bbio->csum_inline;
}
- csum = btrfs_bio->csum;
+ csum = bbio->csum;
} else {
csum = dst;
}
@@ -454,21 +458,27 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
count = search_csum_tree(fs_info, path, cur_disk_bytenr,
search_len, csum_dst);
- if (count <= 0) {
- /*
- * Either we hit a critical error or we didn't find
- * the csum.
- * Either way, we put zero into the csums dst, and skip
- * to the next sector.
- */
+ if (count < 0) {
+ ret = errno_to_blk_status(count);
+ if (bbio)
+ btrfs_bio_free_csum(bbio);
+ break;
+ }
+
+ /*
+ * We didn't find a csum for this range. We need to make sure
+ * we complain loudly about this, because we are not NODATASUM.
+ *
+ * However for the DATA_RELOC inode we could potentially be
+ * relocating data extents for a NODATASUM inode, so the inode
+ * itself won't be marked with NODATASUM, but the extent we're
+ * copying is in fact NODATASUM. If we don't find a csum we
+ * assume this is the case.
+ */
+ if (count == 0) {
memset(csum_dst, 0, csum_size);
count = 1;
- /*
- * For data reloc inode, we need to mark the range
- * NODATASUM so that balance won't report false csum
- * error.
- */
if (BTRFS_I(inode)->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
u64 file_offset;
@@ -489,7 +499,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
}
btrfs_free_path(path);
- return BLK_STS_OK;
+ return ret;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -535,7 +545,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
key.type == BTRFS_EXTENT_CSUM_KEY) {
offset = (start - key.offset) >> fs_info->sectorsize_bits;
if (offset * csum_size <
- btrfs_item_size_nr(leaf, path->slots[0] - 1))
+ btrfs_item_size(leaf, path->slots[0] - 1))
path->slots[0]--;
}
}
@@ -560,7 +570,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (key.offset > start)
start = key.offset;
- size = btrfs_item_size_nr(leaf, path->slots[0]);
+ size = btrfs_item_size(leaf, path->slots[0]);
csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
if (csum_end <= start) {
path->slots[0]++;
@@ -610,32 +620,33 @@ fail:
return ret;
}
-/*
- * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio
+/**
+ * Calculate checksums of the data contained inside a bio
+ *
* @inode: Owner of the data inside the bio
* @bio: Contains the data to be checksummed
- * @file_start: offset in file this bio begins to describe
- * @contig: Boolean. If true/1 means all bio vecs in this bio are
- * contiguous and they begin at @file_start in the file. False/0
- * means this bio can contain potentially discontiguous bio vecs
- * so the logical offset of each should be calculated separately.
+ * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the
+ * file offsets are determined from the page offsets in the bio.
+ * Otherwise, this is the starting file offset of the bio vecs in
+ * @bio, which must be contiguous.
+ * @one_ordered: If true, @bio only refers to one ordered extent.
*/
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 file_start, int contig)
+ u64 offset, bool one_ordered)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
+ const bool use_page_offsets = (offset == (u64)-1);
char *data;
struct bvec_iter iter;
struct bio_vec bvec;
int index;
- int nr_sectors;
+ unsigned int blockcount;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
int i;
- u64 offset;
unsigned nofs_flag;
nofs_flag = memalloc_nofs_save();
@@ -649,18 +660,13 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
- if (contig)
- offset = file_start;
- else
- offset = 0; /* shut up gcc */
-
sums->bytenr = bio->bi_iter.bi_sector << 9;
index = 0;
shash->tfm = fs_info->csum_shash;
bio_for_each_segment(bvec, bio, iter) {
- if (!contig)
+ if (use_page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
if (!ordered) {
@@ -679,13 +685,14 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
}
}
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
+ blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
bvec.bv_len + fs_info->sectorsize
- 1);
- for (i = 0; i < nr_sectors; i++) {
- if (offset >= ordered->file_offset + ordered->num_bytes ||
- offset < ordered->file_offset) {
+ for (i = 0; i < blockcount; i++) {
+ if (!one_ordered &&
+ !in_range(offset, ordered->file_offset,
+ ordered->num_bytes)) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
@@ -709,12 +716,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
index = 0;
}
- data = kmap_atomic(bvec.bv_page);
- crypto_shash_digest(shash, data + bvec.bv_offset
- + (i * fs_info->sectorsize),
+ data = bvec_kmap_local(&bvec);
+ crypto_shash_digest(shash,
+ data + (i * fs_info->sectorsize),
fs_info->sectorsize,
sums->sums + index);
- kunmap_atomic(data);
+ kunmap_local(data);
index += fs_info->csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
@@ -751,7 +758,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
u32 blocksize_bits = fs_info->sectorsize_bits;
leaf = path->nodes[0];
- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
csum_end <<= blocksize_bits;
csum_end += key->offset;
@@ -802,7 +809,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
- ASSERT(root == fs_info->csum_root ||
+ ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
path = btrfs_alloc_path();
@@ -835,7 +842,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
if (key.offset >= end_byte)
break;
- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
csum_end <<= blocksize_bits;
csum_end += key.offset;
@@ -1003,7 +1010,7 @@ again:
item_end = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_csum_item);
item_end = (struct btrfs_csum_item *)((char *)item_end +
- btrfs_item_size_nr(leaf, path->slots[0]));
+ btrfs_item_size(leaf, path->slots[0]));
goto found;
}
ret = PTR_ERR(item);
@@ -1014,7 +1021,7 @@ again:
u32 item_size;
/* we found one, but it isn't big enough yet */
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if ((item_size / csum_size) >=
MAX_CSUM_ITEMS(fs_info, csum_size)) {
/* already at max size, make a new one */
@@ -1071,7 +1078,7 @@ again:
}
extend_csum:
- if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
+ if (csum_offset == btrfs_item_size(leaf, path->slots[0]) /
csum_size) {
int extend_nr;
u64 tmp;
@@ -1126,7 +1133,7 @@ extend_csum:
diff = min(diff,
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
- diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
+ diff = diff - btrfs_item_size(leaf, path->slots[0]);
diff = min_t(u32, btrfs_leaf_free_space(leaf), diff);
diff /= csum_size;
diff *= csum_size;
@@ -1163,7 +1170,7 @@ insert:
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
item_end = (struct btrfs_csum_item *)((unsigned char *)item +
- btrfs_item_size_nr(leaf, path->slots[0]));
+ btrfs_item_size(leaf, path->slots[0]));
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
@@ -1209,6 +1216,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
extent_start = key.offset;
extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ em->generation = btrfs_file_extent_generation(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
em->start = extent_start;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a1762363f61f..380054c94e4b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -50,11 +50,14 @@ struct inode_defrag {
/* root objectid */
u64 root;
- /* last offset we were able to defrag */
- u64 last_offset;
-
- /* if we've wrapped around back to zero once already */
- int cycled;
+ /*
+ * The extent size threshold for autodefrag.
+ *
+ * This value is different for compressed/non-compressed extents,
+ * thus needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+ u32 extent_thresh;
};
static int __compare_inode_defrag(struct inode_defrag *defrag1,
@@ -107,8 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
*/
if (defrag->transid < entry->transid)
entry->transid = defrag->transid;
- if (defrag->last_offset > entry->last_offset)
- entry->last_offset = defrag->last_offset;
+ entry->extent_thresh = min(defrag->extent_thresh,
+ entry->extent_thresh);
return -EEXIST;
}
}
@@ -134,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
* enabled
*/
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, u32 extent_thresh)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -160,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
defrag->root = root->root_key.objectid;
+ defrag->extent_thresh = extent_thresh;
spin_lock(&fs_info->defrag_inodes_lock);
if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
@@ -179,34 +183,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
}
/*
- * Requeue the defrag object. If there is a defrag object that points to
- * the same inode in the tree, we will merge them together (by
- * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
- */
-static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
-
- if (!__need_auto_defrag(fs_info))
- goto out;
-
- /*
- * Here we don't check the IN_DEFRAG flag, because we need merge
- * them together.
- */
- spin_lock(&fs_info->defrag_inodes_lock);
- ret = __btrfs_add_inode_defrag(inode, defrag);
- spin_unlock(&fs_info->defrag_inodes_lock);
- if (ret)
- goto out;
- return;
-out:
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-}
-
-/*
* pick the defragable inode that we want, if it doesn't exist, we will get
* the next one.
*/
@@ -278,8 +254,14 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct btrfs_root *inode_root;
struct inode *inode;
struct btrfs_ioctl_defrag_range_args range;
- int num_defrag;
- int ret;
+ int ret = 0;
+ u64 cur = 0;
+
+again:
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ goto cleanup;
+ if (!__need_auto_defrag(fs_info))
+ goto cleanup;
/* get the inode */
inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
@@ -295,39 +277,30 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
goto cleanup;
}
+ if (cur >= i_size_read(inode)) {
+ iput(inode);
+ goto cleanup;
+ }
+
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
- range.start = defrag->last_offset;
+ range.start = cur;
+ range.extent_thresh = defrag->extent_thresh;
sb_start_write(fs_info->sb);
- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
- /*
- * if we filled the whole defrag batch, there
- * must be more work to do. Queue this defrag
- * again
- */
- if (num_defrag == BTRFS_DEFRAG_BATCH) {
- defrag->last_offset = range.start;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else if (defrag->last_offset && !defrag->cycled) {
- /*
- * we didn't fill our defrag batch, but
- * we didn't start at zero. Make sure we loop
- * around to the start of the file.
- */
- defrag->last_offset = 0;
- defrag->cycled = 1;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- }
-
iput(inode);
- return 0;
+
+ if (ret < 0)
+ goto cleanup;
+
+ cur = max(cur + fs_info->sectorsize, range.start);
+ goto again;
+
cleanup:
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
@@ -437,9 +410,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
/*
* unlocks pages after btrfs_file_write is done with them
*/
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+ struct page **pages, size_t num_pages,
+ u64 pos, u64 copied)
{
size_t i;
+ u64 block_start = round_down(pos, fs_info->sectorsize);
+ u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+ ASSERT(block_len <= U32_MAX);
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
@@ -447,7 +426,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- ClearPageChecked(pages[i]);
+ btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+ block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
@@ -504,7 +484,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct page *p = pages[i];
btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- ClearPageChecked(p);
+ btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
}
@@ -711,7 +691,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
int modify_tree = -1;
int update_refs;
int found = 0;
- int leafs_visited = 0;
struct btrfs_path *path = args->path;
args->bytes_found = 0;
@@ -749,7 +728,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
path->slots[0]--;
}
ret = 0;
- leafs_visited++;
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -761,7 +739,6 @@ next_slot:
ret = 0;
break;
}
- leafs_visited++;
leaf = path->nodes[0];
recow = 1;
}
@@ -869,7 +846,8 @@ next_slot:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
- args->start - extent_offset);
+ args->start - extent_offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
@@ -955,7 +933,8 @@ delete_extent_item:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
- key.offset - extent_offset);
+ key.offset - extent_offset, 0,
+ false);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
args->bytes_found += extent_end - key.offset;
@@ -1005,7 +984,7 @@ delete_extent_item:
* which case it unlocked our path, so check path->locks[0] matches a
* write lock.
*/
- if (!ret && args->replace_extent && leafs_visited == 1 &&
+ if (!ret && args->replace_extent &&
path->locks[0] == BTRFS_WRITE_LOCK &&
btrfs_leaf_free_space(leaf) >=
sizeof(struct btrfs_item) + args->extent_item_size) {
@@ -1020,8 +999,7 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &args->extent_item_size, 1);
+ btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
args->extent_inserted = true;
}
@@ -1232,7 +1210,7 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
num_bytes, 0);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
- orig_offset);
+ orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1257,7 +1235,8 @@ again:
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, 0);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+ 0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -1709,7 +1688,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
* Fault pages before locking them in prepare_pages
* to avoid recursive lock
*/
- if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
ret = -EFAULT;
break;
}
@@ -1740,7 +1719,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
fs_info->sectorsize);
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- reserve_bytes);
+ reserve_bytes,
+ reserve_bytes);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -1844,7 +1824,7 @@ again:
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
break;
}
@@ -1852,7 +1832,7 @@ again:
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
cond_resched();
@@ -1903,16 +1883,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
+ const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
loff_t pos;
ssize_t written = 0;
ssize_t written_buffered;
+ size_t prev_left = 0;
loff_t endbyte;
ssize_t err;
unsigned int ilock_flags = 0;
- struct iomap_dio *dio = NULL;
if (iocb->ki_flags & IOCB_NOWAIT)
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1955,23 +1936,80 @@ relock:
goto buffered;
}
- dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- 0);
+ /*
+ * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
+ * calls generic_write_sync() (through iomap_dio_complete()), because
+ * that results in calling fsync (btrfs_sync_file()) which will try to
+ * lock the inode in exclusive/write mode.
+ */
+ if (is_sync_write)
+ iocb->ki_flags &= ~IOCB_DSYNC;
- btrfs_inode_unlock(inode, ilock_flags);
+ /*
+ * The iov_iter can be mapped to the same file range we are writing to.
+ * If that's the case, then we will deadlock in the iomap code, because
+ * it first calls our callback btrfs_dio_iomap_begin(), which will create
+ * an ordered extent, and after that it will fault in the pages that the
+ * iov_iter refers to. During the fault in we end up in the readahead
+ * pages code (starting at btrfs_readahead()), which will lock the range,
+ * find that ordered extent and then wait for it to complete (at
+ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+ * obviously the ordered extent can never complete as we didn't submit
+ * yet the respective bio(s). This always happens when the buffer is
+ * memory mapped to the same file range, since the iomap DIO code always
+ * invalidates pages in the target file range (after starting and waiting
+ * for any writeback).
+ *
+ * So here we disable page faults in the iov_iter and then retry if we
+ * got -EFAULT, faulting in the pages before the retry.
+ */
+again:
+ from->nofault = true;
+ err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, written);
+ from->nofault = false;
- if (IS_ERR_OR_NULL(dio)) {
- err = PTR_ERR_OR_ZERO(dio);
- if (err < 0 && err != -ENOTBLK)
- goto out;
- } else {
- written = iomap_dio_complete(dio);
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (err > 0)
+ written = err;
+
+ if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+ const size_t left = iov_iter_count(from);
+ /*
+ * We have more data left to write. Try to fault in as many as
+ * possible of the remainder pages and retry. We do this without
+ * releasing and locking again the inode, to prevent races with
+ * truncate.
+ *
+ * Also, in case the iov refers to pages in the file range of the
+ * file we want to write to (due to a mmap), we could enter an
+ * infinite loop if we retry after faulting the pages in, since
+ * iomap will invalidate any pages in the range early on, before
+ * it tries to fault in the pages of the iov. So we keep track of
+ * how much was left of iov in the previous EFAULT and fallback
+ * to buffered IO in case we haven't made any progress.
+ */
+ if (left == prev_left) {
+ err = -ENOTBLK;
+ } else {
+ fault_in_iov_iter_readable(from, left);
+ prev_left = left;
+ goto again;
+ }
}
- if (written < 0 || !iov_iter_count(from)) {
- err = written;
+ btrfs_inode_unlock(inode, ilock_flags);
+
+ /*
+ * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
+ * the fsync (call generic_write_sync()).
+ */
+ if (is_sync_write)
+ iocb->ki_flags |= IOCB_DSYNC;
+
+ /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
+ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
goto out;
- }
buffered:
pos = iocb->ki_pos;
@@ -1996,15 +2034,46 @@ buffered:
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
out:
- return written ? written : err;
+ return err < 0 ? err : written;
}
-static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
- struct iov_iter *from)
+static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ loff_t count;
+ ssize_t ret;
+
+ btrfs_inode_lock(inode, 0);
+ count = encoded->len;
+ ret = generic_write_checks_count(iocb, &count);
+ if (ret == 0 && count != encoded->len) {
+ /*
+ * The write got truncated by generic_write_checks_count(). We
+ * can't do a partial encoded write.
+ */
+ ret = -EFBIG;
+ }
+ if (ret || encoded->len == 0)
+ goto out;
+
+ ret = btrfs_write_check(iocb, from, encoded->len);
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_do_encoded_write(iocb, from, encoded);
+out:
+ btrfs_inode_unlock(inode, 0);
+ return ret;
+}
+
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
{
struct file *file = iocb->ki_filp;
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
- ssize_t num_written = 0;
+ ssize_t num_written, num_sync;
const bool sync = iocb->ki_flags & IOCB_DSYNC;
/*
@@ -2012,25 +2081,31 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* have opened a file as writable, we have to stop this write operation
* to ensure consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
+ if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
- if (!(iocb->ki_flags & IOCB_DIRECT) &&
- (iocb->ki_flags & IOCB_NOWAIT))
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EOPNOTSUPP;
if (sync)
atomic_inc(&inode->sync_writers);
- if (iocb->ki_flags & IOCB_DIRECT)
- num_written = btrfs_direct_write(iocb, from);
- else
- num_written = btrfs_buffered_write(iocb, from);
+ if (encoded) {
+ num_written = btrfs_encoded_write(iocb, from, encoded);
+ num_sync = encoded->len;
+ } else if (iocb->ki_flags & IOCB_DIRECT) {
+ num_written = num_sync = btrfs_direct_write(iocb, from);
+ } else {
+ num_written = num_sync = btrfs_buffered_write(iocb, from);
+ }
btrfs_set_inode_last_sub_trans(inode);
- if (num_written > 0)
- num_written = generic_write_sync(iocb, num_written);
+ if (num_sync > 0) {
+ num_sync = generic_write_sync(iocb, num_sync);
+ if (num_sync < 0)
+ num_written = num_sync;
+ }
if (sync)
atomic_dec(&inode->sync_writers);
@@ -2039,6 +2114,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
return num_written;
}
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ return btrfs_do_write_iter(iocb, from, NULL);
+}
+
int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data;
@@ -2434,7 +2514,7 @@ out:
hole_em = alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_cache(inode, offset, end - 1, 0);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
} else {
hole_em->start = offset;
hole_em->len = end - offset;
@@ -2455,8 +2535,7 @@ out:
} while (ret == -EEXIST);
free_extent_map(hole_em);
if (ret)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
}
return 0;
@@ -2620,7 +2699,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
extent_info->disk_len, 0);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(inode), ref_offset);
+ btrfs_ino(inode), ref_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
}
@@ -2810,7 +2889,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
* maps for the replacement extents (or holes).
*/
if (extent_info && !extent_info->is_new_extent)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
if (ret)
goto out_trans;
@@ -2878,8 +2957,9 @@ out:
return ret;
}
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
@@ -2911,6 +2991,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out_only_mutex;
+
lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
@@ -3351,7 +3435,7 @@ static long btrfs_fallocate(struct file *file, int mode,
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return btrfs_punch_hole(inode, offset, len);
+ return btrfs_punch_hole(file, offset, len);
/*
* Only trigger disk allocation, don't trigger qgroup reserve
@@ -3373,6 +3457,10 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
/*
* TODO: Move these two operations after we have checked
* accurate reserved space, or fallocate can still fail but
@@ -3650,6 +3738,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ size_t prev_left = 0;
+ ssize_t read = 0;
ssize_t ret;
if (fsverity_active(inode))
@@ -3659,9 +3749,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
return 0;
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
+again:
+ /*
+ * This is similar to what we do for direct IO writes, see the comment
+ * at btrfs_direct_write(), but we also disable page faults in addition
+ * to disabling them only at the iov_iter level. This is because when
+ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+ * which can still trigger page fault ins despite having set ->nofault
+ * to true on our 'to' iov_iter.
+ *
+ * The difference to direct IO writes is that we deadlock when trying
+ * to lock the extent range in the inode's tree during the page reads
+ * triggered by the fault in (while for writes it is due to waiting for
+ * our own ordered extent). This is because for direct IO reads,
+ * btrfs_dio_iomap_begin() returns with the extent range locked, which
+ * is only unlocked in the endio callback (end_bio_extent_readpage()).
+ */
+ pagefault_disable();
+ to->nofault = true;
+ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, read);
+ to->nofault = false;
+ pagefault_enable();
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ read = ret;
+
+ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(to);
+
+ if (left == prev_left) {
+ /*
+ * We didn't make any progress since the last attempt,
+ * fallback to a buffered read for the remainder of the
+ * range. This is just to avoid any possibility of looping
+ * for too long.
+ */
+ ret = read;
+ } else {
+ /*
+ * We made some progress since the last retry or this is
+ * the first time we are retrying. Fault in as many pages
+ * as possible and retry.
+ */
+ fault_in_iov_iter_writeable(to, left);
+ prev_left = left;
+ goto again;
+ }
+ }
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
- return ret;
+ return ret < 0 ? ret : read;
}
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index da0eee7c9e5f..01a408db5683 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,8 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "subpage.h"
+#include "inode-item.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
@@ -36,7 +38,7 @@ struct btrfs_trim_range {
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info);
+ struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
u64 *bytes, bool for_alloc);
@@ -44,7 +46,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
- u64 bytes);
+ u64 bytes, bool update_stats);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
@@ -287,9 +289,18 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
- struct inode *inode)
+ struct inode *vfs_inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(vfs_inode),
+ .new_size = 0,
+ .ino = btrfs_ino(BTRFS_I(vfs_inode)),
+ .min_type = BTRFS_EXTENT_DATA_KEY,
+ .clear_extent_range = true,
+ };
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_state *cached_state = NULL;
int ret = 0;
bool locked = false;
@@ -319,19 +330,26 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
}
- btrfs_i_size_write(BTRFS_I(inode), 0);
- truncate_pagecache(inode, 0);
+ btrfs_i_size_write(inode, 0);
+ truncate_pagecache(vfs_inode, 0);
+
+ lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
/*
* We skip the throttling logic for free space cache inodes, so we don't
* need to check for -EAGAIN.
*/
- ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, BTRFS_EXTENT_DATA_KEY, NULL);
+ ret = btrfs_truncate_inode_items(trans, root, &control);
+
+ inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
+
+ unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
if (ret)
goto fail;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
fail:
if (locked)
@@ -411,7 +429,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- ClearPageChecked(io_ctl->pages[i]);
+ btrfs_page_clear_checked(io_ctl->fs_info,
+ io_ctl->pages[i],
+ page_offset(io_ctl->pages[i]),
+ PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
put_page(io_ctl->pages[i]);
}
@@ -662,7 +683,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
- struct btrfs_block_group *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->block_group;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
@@ -868,7 +889,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (!info->bitmap) {
- unlink_free_space(ctl, info);
+ unlink_free_space(ctl, info, true);
ret = btrfs_add_free_space(block_group, info->offset,
info->bytes);
kmem_cache_free(btrfs_free_space_cachep, info);
@@ -882,7 +903,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
bytes);
if (ret)
break;
- bitmap_clear_bits(ctl, info, offset, bytes);
+ bitmap_clear_bits(ctl, info, offset, bytes, true);
offset = info->offset;
bytes = ctl->unit;
}
@@ -1577,6 +1598,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
}
/*
+ * This is a little subtle. We *only* have ->max_extent_size set if we actually
+ * searched through the bitmap and figured out the largest ->max_extent_size,
+ * otherwise it's 0. In the case that it's 0 we don't want to tell the
+ * allocator the wrong thing, we want to use the actual real max_extent_size
+ * we've found already if it's larger, or we want to use ->bytes.
+ *
+ * This matters because find_free_space() will skip entries whose ->bytes is
+ * less than the required bytes. So if we didn't search down this bitmap, we
+ * may pick some previous entry that has a smaller ->max_extent_size than we
+ * have. For example, assume we have two entries, one that has
+ * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
+ * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
+ * call into find_free_space(), and return with max_extent_size == 4K, because
+ * that first bitmap entry had ->max_extent_size set, but the second one did
+ * not. If instead we returned 8K we'd come in searching for 8K, and find the
+ * 8K contiguous range.
+ *
+ * Consider the other case, we have 2 8K chunks in that second entry and still
+ * don't have ->max_extent_size set. We'll return 16K, and the next time the
+ * allocator comes in it'll fully search our second bitmap, and this time it'll
+ * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
+ * right allocation the next loop through.
+ */
+static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
+{
+ if (entry->bitmap && entry->max_extent_size)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
+/*
+ * We want the largest entry to be leftmost, so this is inverted from what you'd
+ * normally expect.
+ */
+static bool entry_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct btrfs_free_space *entry, *exist;
+
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
+ return get_max_extent_size(exist) < get_max_extent_size(entry);
+}
+
+/*
* searches the tree for the given offset.
*
* fuzzy - If this is set, then we are trying to make an allocation, and we just
@@ -1588,15 +1653,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
u64 offset, int bitmap_only, int fuzzy)
{
struct rb_node *n = ctl->free_space_offset.rb_node;
- struct btrfs_free_space *entry, *prev = NULL;
+ struct btrfs_free_space *entry = NULL, *prev = NULL;
/* find entry that is closest to the 'offset' */
- while (1) {
- if (!n) {
- entry = NULL;
- break;
- }
-
+ while (n) {
entry = rb_entry(n, struct btrfs_free_space, offset_index);
prev = entry;
@@ -1606,6 +1666,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
n = n->rb_right;
else
break;
+
+ entry = NULL;
}
if (bitmap_only) {
@@ -1682,6 +1744,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
return NULL;
while (1) {
+ n = rb_next(&entry->offset_index);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
if (entry->bitmap) {
if (entry->offset + BITS_PER_BITMAP *
ctl->unit > offset)
@@ -1690,33 +1756,25 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
if (entry->offset + entry->bytes > offset)
break;
}
-
- n = rb_next(&entry->offset_index);
- if (!n)
- return NULL;
- entry = rb_entry(n, struct btrfs_free_space, offset_index);
}
return entry;
}
-static inline void
-__unlink_free_space(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
+static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
{
rb_erase(&info->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
ctl->free_extents--;
if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
ctl->discardable_extents[BTRFS_STAT_CURR]--;
ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
}
-}
-static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
-{
- __unlink_free_space(ctl, info);
- ctl->free_space -= info->bytes;
+ if (update_stat)
+ ctl->free_space -= info->bytes;
}
static int link_free_space(struct btrfs_free_space_ctl *ctl,
@@ -1730,6 +1788,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
if (ret)
return ret;
+ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
+
if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
ctl->discardable_extents[BTRFS_STAT_CURR]++;
ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
@@ -1740,9 +1800,25 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
return ret;
}
-static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info,
- u64 offset, u64 bytes)
+static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ ASSERT(info->bitmap);
+
+ /*
+ * If our entry is empty it's because we're on a cluster and we don't
+ * want to re-link it into our ctl bytes index.
+ */
+ if (RB_EMPTY_NODE(&info->bytes_index))
+ return;
+
+ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
+ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
+}
+
+static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ u64 offset, u64 bytes, bool update_stat)
{
unsigned long start, count, end;
int extent_delta = -1;
@@ -1758,6 +1834,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
if (info->max_extent_size > ctl->unit)
info->max_extent_size = 0;
+ relink_bitmap_entry(ctl, info);
+
if (start && test_bit(start - 1, info->bitmap))
extent_delta++;
@@ -1769,14 +1847,9 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
}
-}
-static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info, u64 offset,
- u64 bytes)
-{
- __bitmap_clear_bits(ctl, info, offset, bytes);
- ctl->free_space -= bytes;
+ if (update_stat)
+ ctl->free_space -= bytes;
}
static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
@@ -1793,9 +1866,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
bitmap_set(info->bitmap, start, count);
+ /*
+ * We set some bytes, we have no idea what the max extent size is
+ * anymore.
+ */
+ info->max_extent_size = 0;
info->bytes += bytes;
ctl->free_space += bytes;
+ relink_bitmap_entry(ctl, info);
+
if (start && test_bit(start - 1, info->bitmap))
extent_delta--;
@@ -1863,20 +1943,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
*bytes = (u64)(max_bits) * ctl->unit;
bitmap_info->max_extent_size = *bytes;
+ relink_bitmap_entry(ctl, bitmap_info);
return -1;
}
-static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
-{
- if (entry->bitmap)
- return entry->max_extent_size;
- return entry->bytes;
-}
-
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
- unsigned long align, u64 *max_extent_size)
+ unsigned long align, u64 *max_extent_size, bool use_bytes_index)
{
struct btrfs_free_space *entry;
struct rb_node *node;
@@ -1886,16 +1960,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
if (!ctl->free_space_offset.rb_node)
goto out;
+again:
+ if (use_bytes_index) {
+ node = rb_first_cached(&ctl->free_space_bytes);
+ } else {
+ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
+ 0, 1);
+ if (!entry)
+ goto out;
+ node = &entry->offset_index;
+ }
- entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
- if (!entry)
- goto out;
+ for (; node; node = rb_next(node)) {
+ if (use_bytes_index)
+ entry = rb_entry(node, struct btrfs_free_space,
+ bytes_index);
+ else
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
- for (node = &entry->offset_index; node; node = rb_next(node)) {
- entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ /*
+ * If we are using the bytes index then all subsequent entries
+ * in this tree are going to be < bytes, so simply set the max
+ * extent size and exit the loop.
+ *
+ * If we're using the offset index then we need to keep going
+ * through the rest of the tree.
+ */
if (entry->bytes < *bytes) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
+ if (use_bytes_index)
+ break;
continue;
}
@@ -1912,6 +2008,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
tmp = entry->offset;
}
+ /*
+ * We don't break here if we're using the bytes index because we
+ * may have another entry that has the correct alignment that is
+ * the right size, so we don't want to miss that possibility.
+ * At worst this adds another loop through the logic, but if we
+ * broke here we could prematurely ENOSPC.
+ */
if (entry->bytes < *bytes + align_off) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
@@ -1919,6 +2022,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bitmap) {
+ struct rb_node *old_next = rb_next(node);
u64 size = *bytes;
ret = search_bitmap(ctl, entry, &tmp, &size, true);
@@ -1931,6 +2035,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
max(get_max_extent_size(entry),
*max_extent_size);
}
+
+ /*
+ * The bitmap may have gotten re-arranged in the space
+ * index here because the max_extent_size may have been
+ * updated. Start from the beginning again if this
+ * happened.
+ */
+ if (use_bytes_index && old_next != rb_next(node))
+ goto again;
continue;
}
@@ -1969,7 +2082,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
}
- unlink_free_space(ctl, bitmap_info);
+ unlink_free_space(ctl, bitmap_info, true);
kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
ctl->total_bitmaps--;
@@ -2007,7 +2120,7 @@ again:
/* Cannot clear past the end of the bitmap */
search_bytes = min(search_bytes, end - search_start + 1);
- bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+ bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
*offset += search_bytes;
*bytes -= search_bytes;
@@ -2079,12 +2192,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
bitmap_set_bits(ctl, info, offset, bytes_to_set);
- /*
- * We set some bytes, we have no idea what the max extent size is
- * anymore.
- */
- info->max_extent_size = 0;
-
return bytes_to_set;
}
@@ -2092,7 +2199,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- struct btrfs_block_group *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
bool forced = false;
@@ -2161,7 +2268,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
return 0;
if (ctl->op == &free_space_op)
- block_group = ctl->private;
+ block_group = ctl->block_group;
again:
/*
* Since we link bitmaps right into the cluster we need to see if we
@@ -2306,10 +2413,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
/* See try_merge_free_space() comment. */
if (right_info && !right_info->bitmap &&
(!is_trimmed || btrfs_free_space_trimmed(right_info))) {
- if (update_stat)
- unlink_free_space(ctl, right_info);
- else
- __unlink_free_space(ctl, right_info);
+ unlink_free_space(ctl, right_info, update_stat);
info->bytes += right_info->bytes;
kmem_cache_free(btrfs_free_space_cachep, right_info);
merged = true;
@@ -2319,10 +2423,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
if (left_info && !left_info->bitmap &&
left_info->offset + left_info->bytes == offset &&
(!is_trimmed || btrfs_free_space_trimmed(left_info))) {
- if (update_stat)
- unlink_free_space(ctl, left_info);
- else
- __unlink_free_space(ctl, left_info);
+ unlink_free_space(ctl, left_info, update_stat);
info->offset = left_info->offset;
info->bytes += left_info->bytes;
kmem_cache_free(btrfs_free_space_cachep, left_info);
@@ -2358,10 +2459,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
if (!btrfs_free_space_trimmed(bitmap))
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
- if (update_stat)
- bitmap_clear_bits(ctl, bitmap, end, bytes);
- else
- __bitmap_clear_bits(ctl, bitmap, end, bytes);
+ bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
if (!bitmap->bytes)
free_bitmap(ctl, bitmap);
@@ -2415,10 +2513,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
if (!btrfs_free_space_trimmed(bitmap))
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
- if (update_stat)
- bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
- else
- __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+ bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
if (!bitmap->bytes)
free_bitmap(ctl, bitmap);
@@ -2462,12 +2557,12 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
}
}
-int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_free_space_ctl *ctl,
+int __btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes,
enum btrfs_trim_state trim_state)
{
- struct btrfs_block_group *block_group = ctl->private;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
int ret = 0;
u64 filter_bytes = bytes;
@@ -2482,6 +2577,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
info->bytes = bytes;
info->trim_state = trim_state;
RB_CLEAR_NODE(&info->offset_index);
+ RB_CLEAR_NODE(&info->bytes_index);
spin_lock(&ctl->tree_lock);
@@ -2539,10 +2635,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+ bool initial = (size == block_group->length);
+ u64 reclaimable_unusable;
+
+ WARN_ON(!initial && offset + size > block_group->zone_capacity);
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
+ else if (initial)
+ to_free = block_group->zone_capacity;
else if (offset >= block_group->alloc_offset)
to_free = size;
else if (offset + size <= block_group->alloc_offset)
@@ -2565,12 +2667,15 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
spin_unlock(&block_group->lock);
}
+ reclaimable_unusable = block_group->zone_unusable -
+ (block_group->length - block_group->zone_capacity);
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
- block_group->zone_unusable >=
- div_factor_fine(block_group->length, bg_reclaim_threshold)) {
+ reclaimable_unusable >=
+ div_factor_fine(block_group->zone_capacity,
+ bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
@@ -2589,9 +2694,7 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group,
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
- return __btrfs_add_free_space(block_group->fs_info,
- block_group->free_space_ctl,
- bytenr, size, trim_state);
+ return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}
int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
@@ -2622,9 +2725,7 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
- return __btrfs_add_free_space(block_group->fs_info,
- block_group->free_space_ctl,
- bytenr, size, trim_state);
+ return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
@@ -2683,7 +2784,7 @@ again:
re_search = false;
if (!info->bitmap) {
- unlink_free_space(ctl, info);
+ unlink_free_space(ctl, info, true);
if (offset == info->offset) {
u64 to_free = min(bytes, info->bytes);
@@ -2719,7 +2820,7 @@ again:
}
spin_unlock(&ctl->tree_lock);
- ret = __btrfs_add_free_space(block_group->fs_info, ctl,
+ ret = __btrfs_add_free_space(block_group,
offset + bytes,
old_end - (offset + bytes),
info->trim_state);
@@ -2754,8 +2855,9 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
* out the free space after the allocation offset.
*/
if (btrfs_is_zoned(fs_info)) {
- btrfs_info(fs_info, "free space %llu",
- block_group->length - block_group->alloc_offset);
+ btrfs_info(fs_info, "free space %llu active %d",
+ block_group->zone_capacity - block_group->alloc_offset,
+ block_group->zone_is_active);
return;
}
@@ -2783,8 +2885,9 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
spin_lock_init(&ctl->tree_lock);
ctl->unit = fs_info->sectorsize;
ctl->start = block_group->start;
- ctl->private = block_group;
+ ctl->block_group = block_group;
ctl->op = &free_space_op;
+ ctl->free_space_bytes = RB_ROOT_CACHED;
INIT_LIST_HEAD(&ctl->trimming_ranges);
mutex_init(&ctl->cache_writeout_mutex);
@@ -2850,6 +2953,8 @@ static void __btrfs_return_cluster_to_free_space(
}
tree_insert_offset(&ctl->free_space_offset,
entry->offset, &entry->offset_index, bitmap);
+ rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
+ entry_less);
}
cluster->root = RB_ROOT;
spin_unlock(&cluster->lock);
@@ -2865,7 +2970,7 @@ static void __btrfs_remove_free_space_cache_locked(
while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
info = rb_entry(node, struct btrfs_free_space, offset_index);
if (!info->bitmap) {
- unlink_free_space(ctl, info);
+ unlink_free_space(ctl, info, true);
kmem_cache_free(btrfs_free_space_cachep, info);
} else {
free_bitmap(ctl, info);
@@ -2879,8 +2984,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache_locked(ctl);
- if (ctl->private)
- btrfs_discard_update_discardable(ctl->private);
+ if (ctl->block_group)
+ btrfs_discard_update_discardable(ctl->block_group);
spin_unlock(&ctl->tree_lock);
}
@@ -2951,18 +3056,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 align_gap = 0;
u64 align_gap_len = 0;
enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ bool use_bytes_index = (offset == block_group->start);
ASSERT(!btrfs_is_zoned(block_group->fs_info));
spin_lock(&ctl->tree_lock);
entry = find_free_space(ctl, &offset, &bytes_search,
- block_group->full_stripe_len, max_extent_size);
+ block_group->full_stripe_len, max_extent_size,
+ use_bytes_index);
if (!entry)
goto out;
ret = offset;
if (entry->bitmap) {
- bitmap_clear_bits(ctl, entry, offset, bytes);
+ bitmap_clear_bits(ctl, entry, offset, bytes, true);
if (!btrfs_free_space_trimmed(entry))
atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
@@ -2970,7 +3077,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
if (!entry->bytes)
free_bitmap(ctl, entry);
} else {
- unlink_free_space(ctl, entry);
+ unlink_free_space(ctl, entry, true);
align_gap_len = offset - entry->offset;
align_gap = entry->offset;
align_gap_trim_state = entry->trim_state;
@@ -2992,8 +3099,7 @@ out:
spin_unlock(&ctl->tree_lock);
if (align_gap_len)
- __btrfs_add_free_space(block_group->fs_info, ctl,
- align_gap, align_gap_len,
+ __btrfs_add_free_space(block_group, align_gap, align_gap_len,
align_gap_trim_state);
return ret;
}
@@ -3064,7 +3170,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
}
ret = search_start;
- __bitmap_clear_bits(ctl, entry, ret, bytes);
+ bitmap_clear_bits(ctl, entry, ret, bytes, false);
return ret;
}
@@ -3240,6 +3346,17 @@ again:
cluster->window_start = start * ctl->unit + entry->offset;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
+
+ /*
+ * We need to know if we're currently on the normal space index when we
+ * manipulate the bitmap so that we know we need to remove and re-insert
+ * it into the free_space_bytes tree. Clear the bytes_index node here so the
+ * bitmap manipulation helpers know not to mess with the free_space_bytes
+ * tree until this bitmap entry is added back into the normal cache.
+ */
+ RB_CLEAR_NODE(&entry->bytes_index);
+
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 1);
ASSERT(!ret); /* -EEXIST; Logic error */
@@ -3330,6 +3447,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
continue;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 0);
total_size += entry->bytes;
@@ -3521,13 +3639,13 @@ static int do_trimming(struct btrfs_block_group *block_group,
mutex_lock(&ctl->cache_writeout_mutex);
if (reserved_start < start)
- __btrfs_add_free_space(fs_info, ctl, reserved_start,
+ __btrfs_add_free_space(block_group, reserved_start,
start - reserved_start,
reserved_trim_state);
if (start + bytes < reserved_start + reserved_bytes)
- __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
+ __btrfs_add_free_space(block_group, end, reserved_end - end,
reserved_trim_state);
- __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
+ __btrfs_add_free_space(block_group, start, bytes, trim_state);
list_del(&trim_entry->list);
mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3601,7 +3719,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
- unlink_free_space(ctl, entry);
+ unlink_free_space(ctl, entry, true);
/*
* Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
* If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
@@ -3627,7 +3745,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
goto next;
}
- unlink_free_space(ctl, entry);
+ unlink_free_space(ctl, entry, true);
kmem_cache_free(btrfs_free_space_cachep, entry);
}
@@ -3814,7 +3932,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group,
bytes > (max_discard_size + minlen))
bytes = max_discard_size;
- bitmap_clear_bits(ctl, entry, start, bytes);
+ bitmap_clear_bits(ctl, entry, start, bytes, true);
if (entry->bytes == 0)
free_bitmap(ctl, entry);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 1f23088d43f9..15591b299895 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -22,6 +22,7 @@ enum btrfs_trim_state {
struct btrfs_free_space {
struct rb_node offset_index;
+ struct rb_node bytes_index;
u64 offset;
u64 bytes;
u64 max_extent_size;
@@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap(
struct btrfs_free_space_ctl {
spinlock_t tree_lock;
struct rb_root free_space_offset;
+ struct rb_root_cached free_space_bytes;
u64 free_space;
int extents_thresh;
int free_extents;
@@ -54,7 +56,7 @@ struct btrfs_free_space_ctl {
s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
const struct btrfs_free_space_op *op;
- void *private;
+ struct btrfs_block_group *block_group;
struct mutex cache_writeout_mutex;
struct list_head trimming_ranges;
};
@@ -101,10 +103,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl);
-int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_free_space_ctl *ctl,
- u64 bytenr, u64 size,
- enum btrfs_trim_state trim_state);
+int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr,
+ u64 size, enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index a33bca94d133..0ae54d8c10d6 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -16,6 +16,20 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
+static struct btrfs_root *btrfs_free_space_root(
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
+ if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2))
+ key.offset = block_group->global_root_id;
+ return btrfs_global_root(block_group->fs_info, &key);
+}
+
void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
@@ -51,7 +65,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -85,7 +99,7 @@ struct btrfs_free_space_info *search_free_space_info(
struct btrfs_path *path, int cow)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
int ret;
@@ -188,7 +202,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -326,7 +340,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -586,7 +600,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 start, u64 size, int remove)
{
- struct btrfs_root *root = block_group->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
@@ -699,7 +713,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 found_start, found_end;
u64 end = start + size;
@@ -851,7 +865,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key, new_key;
u64 found_start, found_end;
u64 end = start + size;
@@ -1046,7 +1060,7 @@ out:
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
- struct btrfs_root *extent_root = trans->fs_info->extent_root;
+ struct btrfs_root *extent_root;
struct btrfs_path *path, *path2;
struct btrfs_key key;
u64 start, end;
@@ -1080,6 +1094,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
+ extent_root = btrfs_extent_root(trans->fs_info, key.objectid);
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
@@ -1157,7 +1172,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
ret = PTR_ERR(free_space_root);
goto abort;
}
- fs_info->free_space_root = free_space_root;
+ ret = btrfs_global_root_insert(free_space_root);
+ if (ret) {
+ btrfs_put_root(free_space_root);
+ goto abort;
+ }
node = rb_first(&fs_info->block_group_cache_tree);
while (node) {
@@ -1232,7 +1251,12 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_root *free_space_root = fs_info->free_space_root;
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
int ret;
trans = btrfs_start_transaction(tree_root, 0);
@@ -1241,7 +1265,6 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
- fs_info->free_space_root = NULL;
ret = clear_free_space_tree(trans, free_space_root);
if (ret)
@@ -1251,13 +1274,14 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
if (ret)
goto abort;
+ btrfs_global_root_delete(free_space_root);
list_del(&free_space_root->dirty_list);
btrfs_tree_lock(free_space_root->node);
btrfs_clean_tree_block(free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
- btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
- 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
+ free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
@@ -1319,7 +1343,7 @@ out:
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_path *path;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -1410,7 +1434,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- root = fs_info->free_space_root;
+ root = btrfs_free_space_root(block_group);
end = block_group->start + block_group->length;
@@ -1488,7 +1512,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- root = fs_info->free_space_root;
+ root = btrfs_free_space_root(block_group);
end = block_group->start + block_group->length;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 37f36ffdaf6b..0eeb5ea87894 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -4,6 +4,7 @@
*/
#include "ctree.h"
+#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
@@ -19,7 +20,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
u32 cur_offset = 0;
int len;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
@@ -45,7 +46,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
u32 cur_offset = 0;
int ref_name_len;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
/*
@@ -139,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (index)
*index = btrfs_inode_extref_index(leaf, extref);
@@ -208,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (index)
*index = btrfs_inode_ref_index(leaf, ref);
@@ -256,7 +257,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
- struct btrfs_item *item;
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
@@ -282,9 +282,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
goto out;
leaf = path->nodes[0];
- item = btrfs_item_nr(path->slots[0]);
ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
- ptr += btrfs_item_size(leaf, item) - ins_len;
+ ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
extref = (struct btrfs_inode_extref *)ptr;
btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
@@ -332,7 +331,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ref)
goto out;
- old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
btrfs_extend_item(path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
@@ -419,3 +418,332 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
}
return ret;
}
+
+static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
+ struct extent_buffer *leaf,
+ struct btrfs_file_extent_item *fi,
+ u64 offset, int extent_type, int slot)
+{
+ if (!inode)
+ return;
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot,
+ offset);
+ else
+ trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset);
+}
+
+/*
+ * Remove inode items from a given root.
+ *
+ * @trans: A transaction handle.
+ * @root: The root from which to remove items.
+ * @control: The btrfs_truncate_control describing what to truncate and how.
+ * The inode is identified by control->ino; control->inode may be
+ * NULL when control->clear_extent_range is false.
+ *
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equal to control->min_type. When min_type is
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equal to control->new_size. If a file extent item that starts
+ * before new_size and ends after it is found, its length is adjusted.
+ *
+ * Returns: 0 on success, < 0 on error and BTRFS_NEED_TRUNCATE_BLOCK when
+ * control->min_type is BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
+ */
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_truncate_control *control)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 new_size = control->new_size;
+ u64 extent_num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 item_end = 0;
+ u32 found_type = (u8)-1;
+ int del_item;
+ int pending_del_nr = 0;
+ int pending_del_slot = 0;
+ int extent_type = -1;
+ int ret;
+ u64 bytes_deleted = 0;
+ bool be_nice = false;
+
+ ASSERT(control->inode || !control->clear_extent_range);
+ ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY);
+
+ control->last_size = new_size;
+ control->sub_bytes = 0;
+
+ /*
+ * For shareable roots we want to back off from time to time, this turns
+ * out to be subvolume roots, reloc roots, and data reloc roots.
+ */
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ be_nice = true;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_BACK;
+
+ key.objectid = control->ino;
+ key.offset = (u64)-1;
+ key.type = (u8)-1;
+
+search_again:
+ /*
+ * With a 16K leaf size and 128MiB extents, you can actually queue up a
+ * huge file in a single leaf. Most of the time that bytes_deleted is
+ * > 0, it will be huge by the time we get here
+ */
+ if (be_nice && bytes_deleted > SZ_32M &&
+ btrfs_should_end_transaction(trans)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = 0;
+ /* There are no items in the tree for us to truncate, we're done */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (1) {
+ u64 clear_start = 0, clear_len = 0, extent_start = 0;
+ bool should_throttle = false;
+
+ fi = NULL;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = found_key.type;
+
+ if (found_key.objectid != control->ino)
+ break;
+
+ if (found_type < control->min_type)
+ break;
+
+ item_end = found_key.offset;
+ if (found_type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE)
+ item_end +=
+ btrfs_file_extent_num_bytes(leaf, fi);
+ else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ item_end += btrfs_file_extent_ram_bytes(leaf, fi);
+
+ btrfs_trace_truncate(control->inode, leaf, fi,
+ found_key.offset, extent_type,
+ path->slots[0]);
+ item_end--;
+ }
+ if (found_type > control->min_type) {
+ del_item = 1;
+ } else {
+ if (item_end < new_size)
+ break;
+ if (found_key.offset >= new_size)
+ del_item = 1;
+ else
+ del_item = 0;
+ }
+
+ /* FIXME, shrink the extent if the ref count is only 1 */
+ if (found_type != BTRFS_EXTENT_DATA_KEY)
+ goto delete;
+
+ control->extents_found++;
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ u64 num_dec;
+
+ clear_start = found_key.offset;
+ extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (!del_item) {
+ u64 orig_num_bytes =
+ btrfs_file_extent_num_bytes(leaf, fi);
+ extent_num_bytes = ALIGN(new_size -
+ found_key.offset,
+ fs_info->sectorsize);
+ clear_start = ALIGN(new_size, fs_info->sectorsize);
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_num_bytes);
+ num_dec = (orig_num_bytes - extent_num_bytes);
+ if (extent_start != 0)
+ control->sub_bytes += num_dec;
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ extent_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = found_key.offset -
+ btrfs_file_extent_offset(leaf, fi);
+
+ /* FIXME blocksize != 4096 */
+ num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_start != 0)
+ control->sub_bytes += num_dec;
+ }
+ clear_len = num_dec;
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ /*
+ * We can't truncate inline items that have had
+ * special encodings
+ */
+ if (!del_item &&
+ btrfs_file_extent_encryption(leaf, fi) == 0 &&
+ btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
+ btrfs_file_extent_compression(leaf, fi) == 0) {
+ u32 size = (u32)(new_size - found_key.offset);
+
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+ size = btrfs_file_extent_calc_inline_size(size);
+ btrfs_truncate_item(path, size, 1);
+ } else if (!del_item) {
+ /*
+ * We have to bail so the last_size is set to
+ * just before this extent.
+ */
+ ret = BTRFS_NEED_TRUNCATE_BLOCK;
+ break;
+ } else {
+ /*
+ * Inline extents are special, we just treat
+ * them as a full sector worth in the file
+ * extent tree just for simplicity's sake.
+ */
+ clear_len = fs_info->sectorsize;
+ }
+
+ control->sub_bytes += item_end + 1 - new_size;
+ }
+delete:
+ /*
+ * We only want to clear the file extent range if we're
+ * modifying the actual inode's mapping, which is just the
+ * normal truncate path.
+ */
+ if (control->clear_extent_range) {
+ ret = btrfs_inode_clear_file_extent_range(control->inode,
+ clear_start, clear_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+
+ if (del_item) {
+ ASSERT(!pending_del_nr ||
+ ((path->slots[0] + 1) == pending_del_slot));
+
+ control->last_size = found_key.offset;
+ if (!pending_del_nr) {
+ /* No pending yet, add ourselves */
+ pending_del_slot = path->slots[0];
+ pending_del_nr = 1;
+ } else if (pending_del_nr &&
+ path->slots[0] + 1 == pending_del_slot) {
+ /* Hop on the pending chunk */
+ pending_del_nr++;
+ pending_del_slot = path->slots[0];
+ }
+ } else {
+ control->last_size = new_size;
+ break;
+ }
+
+ if (del_item && extent_start != 0 && !control->skip_ref_updates) {
+ struct btrfs_ref ref = { 0 };
+
+ bytes_deleted += extent_num_bytes;
+
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+ extent_start, extent_num_bytes, 0);
+ btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+ control->ino, extent_offset,
+ root->root_key.objectid, false);
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ if (be_nice) {
+ if (btrfs_should_throttle_delayed_refs(trans))
+ should_throttle = true;
+ }
+ }
+
+ if (found_type == BTRFS_INODE_ITEM_KEY)
+ break;
+
+ if (path->slots[0] == 0 ||
+ path->slots[0] != pending_del_slot ||
+ should_throttle) {
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ pending_del_nr = 0;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * We can generate a lot of delayed refs, so we need to
+ * throttle every once in a while and make sure we're
+ * adding enough space to keep up with the work we are
+ * generating. Since we hold a transaction here we
+ * can't flush, and we don't want to FLUSH_LIMIT because
+ * we could have generated too many delayed refs to
+ * actually allocate, so just bail if we're short and
+ * let the normal reservation dance happen higher up.
+ */
+ if (should_throttle) {
+ ret = btrfs_delayed_refs_rsv_refill(fs_info,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (ret) {
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ goto search_again;
+ } else {
+ path->slots[0]--;
+ }
+ }
+out:
+ if (ret >= 0 && pending_del_nr) {
+ int err;
+
+ err = btrfs_del_items(trans, root, path, pending_del_slot,
+ pending_del_nr);
+ if (err) {
+ btrfs_abort_transaction(trans, err);
+ ret = err;
+ }
+ }
+
+ ASSERT(control->last_size >= new_size);
+ if (!ret && control->last_size > new_size)
+ control->last_size = new_size;
+
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
new file mode 100644
index 000000000000..a8fc16d0147f
--- /dev/null
+++ b/fs/btrfs/inode-item.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_INODE_ITEM_H
+#define BTRFS_INODE_ITEM_H
+
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_key;
+struct btrfs_inode_extref;
+struct btrfs_inode;
+struct extent_buffer;
+
+/*
+ * Return this if we need to call truncate_block for the last bit of the
+ * truncate.
+ */
+#define BTRFS_NEED_TRUNCATE_BLOCK 1
+
+struct btrfs_truncate_control {
+ /*
+ * IN: the inode we're operating on, this can be NULL if
+ * ->clear_extent_range is false.
+ */
+ struct btrfs_inode *inode;
+
+ /* IN: the size we're truncating to. */
+ u64 new_size;
+
+ /* OUT: incremented once per file extent item processed; not reset by the callee. */
+ u64 extents_found;
+
+ /* OUT: the last size we truncated this inode to. */
+ u64 last_size;
+
+ /* OUT: the number of bytes to sub from this inode. */
+ u64 sub_bytes;
+
+ /* IN: the ino we are truncating. */
+ u64 ino;
+
+ /*
+ * IN: minimum key type to remove. Keys of exactly this type are
+ * removed only if their offset >= new_size.
+ */
+ u32 min_type;
+
+ /*
+ * IN: true if we don't want to do extent reference updates for any file
+ * extents we drop.
+ */
+ bool skip_ref_updates;
+
+ /*
+ * IN: true if we need to clear the file extent range for the inode as
+ * we drop the file extent items.
+ */
+ bool clear_extent_range;
+};
+
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_truncate_control *control);
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod);
+
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow);
+
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+ int slot, const char *name,
+ int name_len);
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+ struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const char *name, int name_len);
+
+#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7c096ab9bb5e..95c499b8424e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,6 +6,7 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
+#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -53,6 +54,7 @@
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
+#include "inode-item.h"
struct btrfs_iget_args {
u64 ino;
@@ -60,12 +62,15 @@ struct btrfs_iget_args {
};
struct btrfs_dio_data {
- u64 reserve;
- loff_t length;
ssize_t submitted;
struct extent_changeset *data_reserved;
};
+struct btrfs_rename_ctx {
+ /* Output field. Stores the index number of the old directory entry. */
+ u64 index;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -234,12 +239,14 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* no overlapping inline items exist in the btree
*/
static int insert_inline_extent(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, bool extent_inserted,
- struct btrfs_root *root, struct inode *inode,
- u64 start, size_t size, size_t compressed_size,
+ struct btrfs_path *path,
+ struct btrfs_inode *inode, bool extent_inserted,
+ size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
+ struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct page *page = NULL;
char *kaddr;
@@ -247,7 +254,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item *ei;
int ret;
size_t cur_size = size;
- unsigned long offset;
+ u64 i_size;
ASSERT((compressed_size > 0 && compressed_pages) ||
(compressed_size == 0 && !compressed_pages));
@@ -259,8 +266,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
size_t datasize;
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = start;
+ key.objectid = btrfs_ino(inode);
+ key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
@@ -298,12 +305,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
- page = find_get_page(inode->i_mapping,
- start >> PAGE_SHIFT);
+ page = find_get_page(inode->vfs_inode.i_mapping, 0);
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_atomic(page);
- offset = offset_in_page(start);
- write_extent_buffer(leaf, kaddr + offset, ptr, size);
+ write_extent_buffer(leaf, kaddr, ptr, size);
kunmap_atomic(kaddr);
put_page(page);
}
@@ -314,21 +319,25 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
* We align size to sectorsize for inline extents just for simplicity
* sake.
*/
- size = ALIGN(size, root->fs_info->sectorsize);
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
+ ret = btrfs_inode_set_file_extent_range(inode, 0,
+ ALIGN(size, root->fs_info->sectorsize));
if (ret)
goto fail;
/*
- * we're an inline extent, so nobody can
- * extend the file past i_size without locking
- * a page we already have locked.
+ * We're an inline extent, so nobody can extend the file past i_size
+ * without locking a page we already have locked.
*
- * We must do any isize and inode updates
- * before we unlock the pages. Otherwise we
- * could end up racing with unlink.
+ * We must do any i_size and inode updates before we unlock the pages.
+ * Otherwise we could end up racing with unlink.
*/
- BTRFS_I(inode)->disk_i_size = inode->i_size;
+ i_size = i_size_read(&inode->vfs_inode);
+ if (update_i_size && size > i_size) {
+ i_size_write(&inode->vfs_inode, size);
+ i_size = size;
+ }
+ inode->disk_i_size = i_size;
+
fail:
return ret;
}
@@ -339,35 +348,31 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
- u64 end, size_t compressed_size,
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
+ size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 isize = i_size_read(&inode->vfs_inode);
- u64 actual_end = min(end + 1, isize);
- u64 inline_len = actual_end - start;
- u64 aligned_end = ALIGN(end, fs_info->sectorsize);
- u64 data_len = inline_len;
+ u64 data_len = (compressed_size ?: size);
int ret;
struct btrfs_path *path;
- if (compressed_size)
- data_len = compressed_size;
-
- if (start > 0 ||
- actual_end > fs_info->sectorsize ||
+ /*
+ * We can create an inline extent if it ends at or beyond the current
+ * i_size, is no larger than a sector (decompressed), and the (possibly
+ * compressed) data fits in a leaf and the configured maximum inline
+ * size.
+ */
+ if (size < i_size_read(&inode->vfs_inode) ||
+ size > fs_info->sectorsize ||
data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- (!compressed_size &&
- (actual_end & (fs_info->sectorsize - 1)) == 0) ||
- end + 1 < isize ||
- data_len > fs_info->max_inline) {
+ data_len > fs_info->max_inline)
return 1;
- }
path = btrfs_alloc_path();
if (!path)
@@ -381,30 +386,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
trans->block_rsv = &inode->block_rsv;
drop_args.path = path;
- drop_args.start = start;
- drop_args.end = aligned_end;
+ drop_args.start = 0;
+ drop_args.end = fs_info->sectorsize;
drop_args.drop_cache = true;
drop_args.replace_extent = true;
-
- if (compressed_size && compressed_pages)
- drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
- compressed_size);
- else
- drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
- inline_len);
-
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- if (isize > actual_end)
- inline_len = min_t(u64, isize, actual_end);
- ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
- root, &inode->vfs_inode, start,
- inline_len, compressed_size,
- compress_type, compressed_pages);
+ ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
+ size, compressed_size, compress_type,
+ compressed_pages, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -413,7 +408,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
- btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
+ btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, inode);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
@@ -423,7 +418,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -456,11 +451,10 @@ struct async_chunk {
struct list_head extents;
struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
- atomic_t *pending;
+ struct async_cow *async_cow;
};
struct async_cow {
- /* Number of chunks in flight; must be first in the structure */
atomic_t num_chunks;
struct async_chunk chunks[];
};
@@ -487,20 +481,6 @@ static noinline int add_async_extent(struct async_chunk *cow,
}
/*
- * Check if the inode has flags compatible with compression
- */
-static inline bool inode_can_compress(struct btrfs_inode *inode)
-{
- /* Subpage doesn't support compression yet */
- if (inode->root->fs_info->sectorsize < PAGE_SIZE)
- return false;
- if (inode->flags & BTRFS_INODE_NODATACOW ||
- inode->flags & BTRFS_INODE_NODATASUM)
- return false;
- return true;
-}
-
-/*
* Check if the inode needs to be submitted to compression, based on mount
* options, defragmentation, properties or heuristics.
*/
@@ -509,12 +489,44 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (!inode_can_compress(inode)) {
+ if (!btrfs_inode_can_compress(inode)) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
btrfs_ino(inode));
return 0;
}
+ /*
+ * Special check for subpage.
+ *
+ * We lock the full page then run each delalloc range in the page, thus
+ * for the following case, we will hit some subpage specific corner case:
+ *
+ * 0 32K 64K
+ * | |///////| |///////|
+ * \- A \- B
+ *
+ * In the above case, both range A and range B will try to unlock the full
+ * page [0, 64K), so whichever finishes later will find the page already
+ * unlocked, triggering various page lock requirement BUG_ON()s.
+ *
+ * So here we add an artificial limit that subpage compression can only
+ * be done if the range is fully page aligned.
+ *
+ * In theory we only need to ensure the first page is fully covered, but
+ * the trailing partial page will be locked until the full compression
+ * finishes, delaying the write of other ranges.
+ *
+ * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges first
+ * to prevent any submitted async extent from unlocking the full page.
+ * By this, we can ensure for subpage case that only the last async_cow
+ * will unlock the full page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(end + 1, PAGE_SIZE))
+ return 0;
+ }
+
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
@@ -532,12 +544,12 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
}
static inline void inode_should_defrag(struct btrfs_inode *inode,
- u64 start, u64 end, u64 num_bytes, u64 small_write)
+ u64 start, u64 end, u64 num_bytes, u32 small_write)
{
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
- btrfs_add_inode_defrag(NULL, inode);
+ btrfs_add_inode_defrag(NULL, inode, small_write);
}
/*
@@ -596,7 +608,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
nr_pages = min_t(unsigned long, nr_pages,
BTRFS_MAX_COMPRESSED / PAGE_SIZE);
@@ -616,13 +627,24 @@ again:
total_compressed = actual_end - start;
/*
- * skip compression for a small file range(<=blocksize) that
+ * Skip compression for a small file range(<=blocksize) that
* isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
goto cleanup_and_bail_uncompressed;
+ /*
+ * For subpage case, we require full page alignment for the sector
+ * aligned range.
+ * Thus we must also check against @actual_end, not just @end.
+ */
+ if (blocksize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+ goto cleanup_and_bail_uncompressed;
+ }
+
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
@@ -696,14 +718,15 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
0, BTRFS_COMPRESS_NONE,
- NULL);
+ NULL, false);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
total_compressed,
- compress_type, pages);
+ compress_type, pages,
+ false);
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
@@ -760,7 +783,7 @@ cont:
* win, compare the page count read with the blocks on disk,
* compression must free at least one sector size
*/
- total_in = ALIGN(total_in, PAGE_SIZE);
+ total_in = round_up(total_in, fs_info->sectorsize);
if (total_compressed + blocksize <= total_in) {
compressed_extents++;
@@ -841,166 +864,151 @@ static void free_async_extent_pages(struct async_extent *async_extent)
async_extent->pages = NULL;
}
-/*
- * phase two of compressed writeback. This is the ordered portion
- * of the code, which only gets called in the order the work was
- * queued. We walk all the async extents created by compress_file_range
- * and send them down to the disk.
- */
-static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+static int submit_uncompressed_range(struct btrfs_inode *inode,
+ struct async_extent *async_extent,
+ struct page *locked_page)
{
- struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct async_extent *async_extent;
- u64 alloc_hint = 0;
- struct btrfs_key ins;
- struct extent_map *em;
- struct btrfs_root *root = inode->root;
- struct extent_io_tree *io_tree = &inode->io_tree;
- int ret = 0;
-
-again:
- while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
- list_del(&async_extent->list);
-
-retry:
- lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1);
- /* did the compression code fall back to uncompressed IO? */
- if (!async_extent->pages) {
- int page_started = 0;
- unsigned long nr_written = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
+ unsigned long nr_written = 0;
+ int page_started = 0;
+ int ret;
- /* allocate blocks */
- ret = cow_file_range(inode, async_chunk->locked_page,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ /*
+ * Call cow_file_range() to run the delalloc range directly, since we
+ * won't go to NOCOW or async path again.
+ *
+ * Also we call cow_file_range() with @unlock_page == 0, so that we
+ * can directly submit them without interruption.
+ */
+ ret = cow_file_range(inode, locked_page, start, end, &page_started,
+ &nr_written, 0);
+ /* Inline extent inserted, page gets unlocked and everything is done */
+ if (page_started) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0) {
+ if (locked_page)
+ unlock_page(locked_page);
+ goto out;
+ }
- /* JDM XXX */
+ ret = extent_write_locked_range(&inode->vfs_inode, start, end);
+ /* All pages will be unlocked, including @locked_page */
+out:
+ kfree(async_extent);
+ return ret;
+}
- /*
- * if page_started, cow_file_range inserted an
- * inline extent and took care of all the unlocking
- * and IO for us. Otherwise, we need to submit
- * all those pages down to the drive.
- */
- if (!page_started && !ret)
- extent_write_locked_range(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- WB_SYNC_ALL);
- else if (ret && async_chunk->locked_page)
- unlock_page(async_chunk->locked_page);
- kfree(async_extent);
- cond_resched();
- continue;
- }
+static int submit_one_async_extent(struct btrfs_inode *inode,
+ struct async_chunk *async_chunk,
+ struct async_extent *async_extent,
+ u64 *alloc_hint)
+{
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key ins;
+ struct page *locked_page = NULL;
+ struct extent_map *em;
+ int ret = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
- ret = btrfs_reserve_extent(root, async_extent->ram_size,
- async_extent->compressed_size,
- async_extent->compressed_size,
- 0, alloc_hint, &ins, 1, 1);
- if (ret) {
- free_async_extent_pages(async_extent);
+ /*
+ * If async_chunk->locked_page is in the async_extent range, we need to
+ * handle it.
+ */
+ if (async_chunk->locked_page) {
+ u64 locked_page_start = page_offset(async_chunk->locked_page);
+ u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
- if (ret == -ENOSPC) {
- unlock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ if (!(start >= locked_page_end || end <= locked_page_start))
+ locked_page = async_chunk->locked_page;
+ }
+ lock_extent(io_tree, start, end);
- /*
- * we need to redirty the pages if we decide to
- * fallback to uncompressed IO, otherwise we
- * will not submit these pages down to lower
- * layers.
- */
- extent_range_redirty_for_io(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ /* We have fallen back to uncompressed write */
+ if (!async_extent->pages)
+ return submit_uncompressed_range(inode, async_extent, locked_page);
- goto retry;
- }
- goto out_free;
- }
+ ret = btrfs_reserve_extent(root, async_extent->ram_size,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, *alloc_hint, &ins, 1, 1);
+ if (ret) {
+ free_async_extent_pages(async_extent);
/*
- * here we're doing allocation and writeback of the
- * compressed pages
+ * Here we used to try again by going back to non-compressed
+ * path for ENOSPC. But if we can't reserve space even for the
+ * compressed size, how could it work for the uncompressed size,
+ * which requires even more space? So here we go directly to the
+ * error path.
*/
- em = create_io_em(inode, async_extent->start,
- async_extent->ram_size, /* len */
- async_extent->start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
- if (IS_ERR(em))
- /* ret value is not necessary due to void function */
- goto out_free_reserve;
- free_extent_map(em);
-
- ret = btrfs_add_ordered_extent_compress(inode,
- async_extent->start,
- ins.objectid,
- async_extent->ram_size,
- ins.offset,
- async_extent->compress_type);
- if (ret) {
- btrfs_drop_extent_cache(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1, 0);
- goto out_free_reserve;
- }
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ goto out_free;
+ }
+
+ /* Here we're doing allocation and writeback of the compressed pages */
+ em = create_io_em(inode, start,
+ async_extent->ram_size, /* len */
+ start, /* orig_start */
+ ins.objectid, /* block_start */
+ ins.offset, /* block_len */
+ ins.offset, /* orig_block_len */
+ async_extent->ram_size, /* ram_bytes */
+ async_extent->compress_type,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserve;
+ }
+ free_extent_map(em);
- /*
- * clear dirty, set writeback and unlock the pages.
- */
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- PAGE_UNLOCK | PAGE_START_WRITEBACK);
- if (btrfs_submit_compressed_write(inode, async_extent->start,
- async_extent->ram_size,
- ins.objectid,
- ins.offset, async_extent->pages,
- async_extent->nr_pages,
- async_chunk->write_flags,
- async_chunk->blkcg_css)) {
- struct page *p = async_extent->pages[0];
- const u64 start = async_extent->start;
- const u64 end = start + async_extent->ram_size - 1;
-
- p->mapping = inode->vfs_inode.i_mapping;
- btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, false);
-
- p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
- PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
- free_async_extent_pages(async_extent);
- }
- alloc_hint = ins.objectid + ins.offset;
- kfree(async_extent);
- cond_resched();
+ ret = btrfs_add_ordered_extent(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ async_extent->ram_size, /* ram_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* disk_num_bytes */
+ 0, /* offset */
+ 1 << BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ goto out_free_reserve;
}
- return;
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ /* Clear dirty, set writeback and unlock the pages. */
+ extent_clear_unlock_delalloc(inode, start, end,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
+ if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* compressed_len */
+ async_extent->pages, /* compressed_pages */
+ async_extent->nr_pages,
+ async_chunk->write_flags,
+ async_chunk->blkcg_css, true)) {
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
+
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
+ *alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ return ret;
+
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
@@ -1008,7 +1016,39 @@ out_free:
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
- goto again;
+ return ret;
+}
+
+/*
+ * Phase two of compressed writeback. This is the ordered portion of the code,
+ * which only gets called in the order the work was queued. We walk all the
+ * async extents created by compress_file_range and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+{
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ int ret = 0;
+
+ while (!list_empty(&async_chunk->extents)) {
+ u64 extent_start;
+ u64 ram_size;
+
+ async_extent = list_entry(async_chunk->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+ extent_start = async_extent->start;
+ ram_size = async_extent->ram_size;
+
+ ret = submit_one_async_extent(inode, async_chunk, async_extent,
+ &alloc_hint);
+ btrfs_debug(fs_info,
+"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), extent_start, ram_size, ret);
+ }
}
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
@@ -1077,7 +1117,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
- WARN_ON_ONCE(1);
ret = -EINVAL;
goto out_unlock;
}
@@ -1099,9 +1138,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* So here we skip inline extent creation completely.
*/
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
+ u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
+ end + 1);
+
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(inode, actual_end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
if (ret == 0) {
/*
* We use DO_ACCOUNTING here because we need the
@@ -1151,7 +1193,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* fails during the stage where it updates the bytenr of file extent
* items.
*/
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
min_alloc_size = num_bytes;
else
min_alloc_size = fs_info->sectorsize;
@@ -1181,14 +1223,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- ram_size, cur_alloc_size,
- BTRFS_ORDERED_REGULAR);
+ ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
+ ins.objectid, cur_alloc_size, 0,
+ 1 << BTRFS_ORDERED_REGULAR,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto out_drop_extent_cache;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
/*
@@ -1326,18 +1368,17 @@ static noinline void async_cow_submit(struct btrfs_work *work)
static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
- /*
- * Since the pointer to 'pending' is at the beginning of the array of
- * async_chunk's, freeing it ensures the whole array has been freed.
- */
- if (atomic_dec_and_test(async_chunk->pending))
- kvfree(async_chunk->pending);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
}
static int cow_file_range_async(struct btrfs_inode *inode,
@@ -1398,7 +1439,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
* lightweight reference for the callback lifetime
*/
ihold(&inode->vfs_inode);
- async_chunk[i].pending = &ctx->num_chunks;
+ async_chunk[i].async_cow = ctx;
async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
@@ -1471,7 +1512,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
__set_page_dirty_nobuffers(locked_page);
account_page_redirty(locked_page);
- extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+ extent_write_locked_range(&inode->vfs_inode, start, end);
*page_started = 1;
return 0;
@@ -1480,11 +1521,12 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
- int ret;
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
struct btrfs_ordered_sum *sums;
+ int ret;
LIST_HEAD(list);
- ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
+ ret = btrfs_lookup_csums_range(csum_root, bytenr,
bytenr + num_bytes - 1, &list, 0);
if (ret == 0 && list_empty(&list))
return 0;
@@ -1504,8 +1546,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
int *page_started, unsigned long *nr_written)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
- const bool is_reloc_ino = (inode->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID);
+ const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
@@ -1844,10 +1885,11 @@ out_check:
goto error;
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, cur_offset,
- disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_PREALLOC);
+ ret = btrfs_add_ordered_extent(inode,
+ cur_offset, num_bytes, num_bytes,
+ disk_bytenr, num_bytes, 0,
+ 1 << BTRFS_ORDERED_PREALLOC,
+ BTRFS_COMPRESS_NONE);
if (ret) {
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + num_bytes - 1,
@@ -1856,9 +1898,11 @@ out_check:
}
} else {
ret = btrfs_add_ordered_extent(inode, cur_offset,
+ num_bytes, num_bytes,
disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_NOCOW);
+ 0,
+ 1 << BTRFS_ORDERED_NOCOW,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto error;
}
@@ -1867,8 +1911,7 @@ out_check:
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
nocow = false;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
/*
* Error handled later, as we must prevent
* extent_clear_unlock_delalloc() in error handler
@@ -1947,11 +1990,25 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
int ret;
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
+ /*
+ * The range must cover part of the @locked_page, or the returned
+ * @page_started can confuse the caller.
+ */
+ ASSERT(!(end <= page_offset(locked_page) ||
+ start >= page_offset(locked_page) + PAGE_SIZE));
+
if (should_nocow(inode, start, end)) {
- ASSERT(!zoned);
+ /*
+ * Normally on a zoned device we're only doing COW writes, but
+ * in case of relocation on a zoned filesystem we have taken
+ * precaution, that we're only writing sequentially. It's safe
+ * to use run_delalloc_nocow() here, like for regular
+ * preallocated inodes.
+ */
+ ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
- } else if (!inode_can_compress(inode) ||
+ } else if (!btrfs_inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
if (zoned)
ret = run_delalloc_zoned(inode, locked_page, start, end,
@@ -2207,7 +2264,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (btrfs_is_testing(fs_info))
return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
(*bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
@@ -2235,48 +2292,6 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
}
/*
- * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
- * in a chunk's stripe. This function ensures that bios do not span a
- * stripe/chunk
- *
- * @page - The page we are about to add to the bio
- * @size - size we want to add to the bio
- * @bio - bio we want to ensure is smaller than a stripe
- * @bio_flags - flags of the bio
- *
- * return 1 if page cannot be added to the bio
- * return 0 if page can be added to the bio
- * return error otherwise
- */
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags)
-{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 logical = bio->bi_iter.bi_sector << 9;
- u32 bio_len = bio->bi_iter.bi_size;
- struct extent_map *em;
- int ret = 0;
- struct btrfs_io_geometry geom;
-
- if (bio_flags & EXTENT_BIO_COMPRESSED)
- return 0;
-
- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
- if (IS_ERR(em))
- return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
- if (ret < 0)
- goto out;
-
- if (geom.len < bio_len + size)
- ret = 1;
-out:
- free_extent_map(em);
- return ret;
-}
-
-/*
* in order to insert checksums into the metadata in large chunks,
* we wait until bio submission time. All the pages in the bio are
* checksummed and sums are attached onto the ordered extent record.
@@ -2287,7 +2302,7 @@ out:
static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
}
/*
@@ -2495,7 +2510,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
- !fs_info->csum_root;
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
if (btrfs_is_free_space_inode(BTRFS_I(inode)))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
@@ -2515,10 +2530,15 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
goto out;
if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ /*
+ * btrfs_submit_compressed_read will handle completing
+ * the bio if there were any errors, so just return
+ * here.
+ */
ret = btrfs_submit_compressed_read(inode, bio,
mirror_num,
bio_flags);
- goto out;
+ goto out_no_endio;
} else {
/*
* Lookup bio sums does extra checks around whether we
@@ -2532,14 +2552,14 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
goto mapit;
} else if (async && !skip_sum) {
/* csum items have already been cloned */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
goto mapit;
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
0, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
if (ret)
goto out;
}
@@ -2552,6 +2572,7 @@ out:
bio->bi_status = ret;
bio_endio(bio);
}
+out_no_endio:
return ret;
}
@@ -2563,11 +2584,15 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
struct list_head *list)
{
struct btrfs_ordered_sum *sum;
+ struct btrfs_root *csum_root = NULL;
int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
+ if (!csum_root)
+ csum_root = btrfs_csum_root(trans->fs_info,
+ sum->bytenr);
+ ret = btrfs_csum_file_blocks(trans, csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
@@ -2765,7 +2790,7 @@ out_page:
clear_page_dirty_for_io(page);
SetPageError(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2820,7 +2845,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- SetPageChecked(page);
+ btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
@@ -2843,6 +2868,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
+ u64 offset = btrfs_stack_file_extent_offset(stack_fi);
u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
struct btrfs_drop_extents_args drop_args = { 0 };
@@ -2917,7 +2943,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
goto out;
ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
- file_pos, qgroup_reserved, &ins);
+ file_pos - offset,
+ qgroup_reserved, &ins);
out:
btrfs_free_path(path);
@@ -2943,20 +2970,20 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
- u64 logical_len;
bool update_inode_bytes;
+ u64 num_bytes = oe->num_bytes;
+ u64 ram_bytes = oe->ram_bytes;
memset(&stack_fi, 0, sizeof(stack_fi));
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
+ btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
- logical_len = oe->truncated_len;
- else
- logical_len = oe->num_bytes;
- btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
- btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
+ num_bytes = ram_bytes = oe->truncated_len;
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
@@ -2967,6 +2994,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
* except if the ordered extent was truncated.
*/
update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
@@ -3001,7 +3029,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
- !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
clear_bits |= EXTENT_DELALLOC_NEW;
freespace_inode = btrfs_is_free_space_inode(inode);
@@ -3011,8 +3040,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (ordered_extent->bdev)
+ /* A valid bdev implies a write on a sequential zone */
+ if (ordered_extent->bdev) {
btrfs_rewrite_logical_zoned(ordered_extent);
+ btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ }
btrfs_free_io_failure_record(inode, start, end);
@@ -3209,7 +3242,7 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
*
* The length of such check is always one sector size.
*/
-static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff,
u64 start)
{
@@ -3225,7 +3258,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
ASSERT(pgoff + len <= PAGE_SIZE);
offset_sectors = bio_offset >> fs_info->sectorsize_bits;
- csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
+ csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
@@ -3239,9 +3272,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
return 0;
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- io_bio->mirror_num);
- if (io_bio->device)
- btrfs_dev_stat_inc_and_print(io_bio->device,
+ bbio->mirror_num);
+ if (bbio->device)
+ btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
@@ -3261,39 +3294,35 @@ zeroit:
* Return a bitmap where bit set means a csum mismatch, and bit not set means
* csum match.
*/
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
unsigned int result = 0;
- if (PageChecked(page)) {
- ClearPageChecked(page);
+ if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
+ btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
return 0;
}
/*
- * For subpage case, above PageChecked is not safe as it's not subpage
- * compatible.
- * But for now only cow fixup and compressed read utilize PageChecked
- * flag, while in this context we can easily use io_bio->csum to
- * determine if we really need to do csum verification.
- *
- * So for now, just exit if io_bio->csum is NULL, as it means it's
- * compressed read, and its compressed data csum has already been
- * verified.
+ * This only happens for NODATASUM or compressed read.
+ * Normally this should be covered by above check for compressed read
+ * or the next check for NODATASUM. Just do a quicker exit here.
*/
- if (io_bio->csum == NULL)
+ if (bbio->csum == NULL)
return 0;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
- if (!root->fs_info->csum_root)
+ if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
return 0;
ASSERT(page_offset(page) <= start &&
@@ -3304,7 +3333,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
u64 file_offset = pg_off + page_offset(page);
int ret;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (btrfs_is_data_reloc_root(root) &&
test_range_bit(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM, 1, NULL)) {
@@ -3314,7 +3343,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
EXTENT_NODATASUM);
continue;
}
- ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+ ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
const int nr_bit = (pg_off - offset_in_page(start)) >>
@@ -3454,7 +3483,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
- if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
+ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
return 0;
path = btrfs_alloc_path();
@@ -3612,8 +3641,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* release the path since we're done with it */
btrfs_release_path(path);
- root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
-
if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
@@ -4005,7 +4032,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !btrfs_is_data_reloc_root(root)
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
@@ -4035,11 +4062,12 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
* also drops the back refs in the inode to the directory
*/
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name, int name_len)
+ const char *name, int name_len,
+ struct btrfs_rename_ctx *rename_ctx)
{
+ struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
@@ -4093,26 +4121,28 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
}
skip_backref:
+ if (rename_ctx)
+ rename_ctx->index = index;
+
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto err;
}
- ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- if (ret != 0 && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- goto err;
+ /*
+ * If we are in a rename context, we don't need to update anything in the
+ * log. That will be done later during the rename by btrfs_log_new_name().
+ * Besides that, doing it here would only cause extra unnecessary btree
+ * operations on the log tree, increasing latency for applications.
+ */
+ if (!rename_ctx) {
+ btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
+ index);
}
- ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
- index);
- if (ret == -ENOENT)
- ret = 0;
- else if (ret)
- btrfs_abort_transaction(trans, ret);
-
/*
* If we have a pending delayed iput we could end up with the final iput
* being run in btrfs-cleaner context. If we have enough of these built
@@ -4139,15 +4169,14 @@ out:
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode->root, inode);
}
return ret;
}
@@ -4176,7 +4205,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
struct inode *inode = d_inode(dentry);
int ret;
@@ -4188,7 +4216,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
0);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (ret)
@@ -4202,7 +4230,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return ret;
}
@@ -4369,7 +4397,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
@@ -4447,6 +4475,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
return -EPERM;
}
+ if (atomic_read(&dest->nr_swapfiles)) {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu with active swapfile",
+ root->root_key.objectid);
+ return -EPERM;
+ }
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
@@ -4552,15 +4587,21 @@ out_up_write:
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int err = 0;
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
return btrfs_delete_subvolume(dir, dentry);
+ }
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
@@ -4578,7 +4619,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ err = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (!err) {
@@ -4599,395 +4640,12 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(fs_info);
return err;
}
/*
- * Return this if we need to call truncate_block for the last bit of the
- * truncate.
- */
-#define NEED_TRUNCATE_BLOCK 1
-
-/*
- * Remove inode items from a given root.
- *
- * @trans: A transaction handle.
- * @root: The root from which to remove items.
- * @inode: The inode whose items we want to remove.
- * @new_size: The new i_size for the inode. This is only applicable when
- * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
- * @min_type: The minimum key type to remove. All keys with a type
- * greater than this value are removed and all keys with
- * this type are removed only if their offset is >= @new_size.
- * @extents_found: Output parameter that will contain the number of file
- * extent items that were removed or adjusted to the new
- * inode i_size. The caller is responsible for initializing
- * the counter. Also, it can be NULL if the caller does not
- * need this counter.
- *
- * Remove all keys associated with the inode from the given root that have a key
- * with a type greater than or equals to @min_type. When @min_type has a value of
- * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
- * greater than or equals to @new_size. If a file extent item that starts before
- * @new_size and ends after it is found, its length is adjusted.
- *
- * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
- * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
- */
-int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode,
- u64 new_size, u32 min_type,
- u64 *extents_found)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *fi;
- struct btrfs_key key;
- struct btrfs_key found_key;
- u64 extent_start = 0;
- u64 extent_num_bytes = 0;
- u64 extent_offset = 0;
- u64 item_end = 0;
- u64 last_size = new_size;
- u32 found_type = (u8)-1;
- int found_extent;
- int del_item;
- int pending_del_nr = 0;
- int pending_del_slot = 0;
- int extent_type = -1;
- int ret;
- u64 ino = btrfs_ino(inode);
- u64 bytes_deleted = 0;
- bool be_nice = false;
- bool should_throttle = false;
- const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
- struct extent_state *cached_state = NULL;
-
- BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
-
- /*
- * For non-free space inodes and non-shareable roots, we want to back
- * off from time to time. This means all inodes in subvolume roots,
- * reloc roots, and data reloc roots.
- */
- if (!btrfs_is_free_space_inode(inode) &&
- test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- be_nice = true;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = READA_BACK;
-
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
- &cached_state);
-
- /*
- * We want to drop from the next block forward in case this
- * new size is not block aligned since we will be keeping the
- * last block of the extent just the way it is.
- */
- btrfs_drop_extent_cache(inode, ALIGN(new_size,
- fs_info->sectorsize),
- (u64)-1, 0);
- }
-
- /*
- * This function is also used to drop the items in the log tree before
- * we relog the inode, so if root != BTRFS_I(inode)->root, it means
- * it is used to drop the logged items. So we shouldn't kill the delayed
- * items.
- */
- if (min_type == 0 && root == inode->root)
- btrfs_kill_delayed_inode_items(inode);
-
- key.objectid = ino;
- key.offset = (u64)-1;
- key.type = (u8)-1;
-
-search_again:
- /*
- * with a 16K leaf size and 128MB extents, you can actually queue
- * up a huge file in a single leaf. Most of the time that
- * bytes_deleted is > 0, it will be huge by the time we get here
- */
- if (be_nice && bytes_deleted > SZ_32M &&
- btrfs_should_end_transaction(trans)) {
- ret = -EAGAIN;
- goto out;
- }
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = 0;
- /* there are no items in the tree for us to truncate, we're
- * done
- */
- if (path->slots[0] == 0)
- goto out;
- path->slots[0]--;
- }
-
- while (1) {
- u64 clear_start = 0, clear_len = 0;
-
- fi = NULL;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = found_key.type;
-
- if (found_key.objectid != ino)
- break;
-
- if (found_type < min_type)
- break;
-
- item_end = found_key.offset;
- if (found_type == BTRFS_EXTENT_DATA_KEY) {
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(leaf, fi);
- if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
- item_end +=
- btrfs_file_extent_num_bytes(leaf, fi);
-
- trace_btrfs_truncate_show_fi_regular(
- inode, leaf, fi, found_key.offset);
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- item_end += btrfs_file_extent_ram_bytes(leaf,
- fi);
-
- trace_btrfs_truncate_show_fi_inline(
- inode, leaf, fi, path->slots[0],
- found_key.offset);
- }
- item_end--;
- }
- if (found_type > min_type) {
- del_item = 1;
- } else {
- if (item_end < new_size)
- break;
- if (found_key.offset >= new_size)
- del_item = 1;
- else
- del_item = 0;
- }
- found_extent = 0;
- /* FIXME, shrink the extent if the ref count is only 1 */
- if (found_type != BTRFS_EXTENT_DATA_KEY)
- goto delete;
-
- if (extents_found != NULL)
- (*extents_found)++;
-
- if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
- u64 num_dec;
-
- clear_start = found_key.offset;
- extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (!del_item) {
- u64 orig_num_bytes =
- btrfs_file_extent_num_bytes(leaf, fi);
- extent_num_bytes = ALIGN(new_size -
- found_key.offset,
- fs_info->sectorsize);
- clear_start = ALIGN(new_size, fs_info->sectorsize);
- btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_num_bytes);
- num_dec = (orig_num_bytes -
- extent_num_bytes);
- if (test_bit(BTRFS_ROOT_SHAREABLE,
- &root->state) &&
- extent_start != 0)
- inode_sub_bytes(&inode->vfs_inode,
- num_dec);
- btrfs_mark_buffer_dirty(leaf);
- } else {
- extent_num_bytes =
- btrfs_file_extent_disk_num_bytes(leaf,
- fi);
- extent_offset = found_key.offset -
- btrfs_file_extent_offset(leaf, fi);
-
- /* FIXME blocksize != 4096 */
- num_dec = btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_start != 0) {
- found_extent = 1;
- if (test_bit(BTRFS_ROOT_SHAREABLE,
- &root->state))
- inode_sub_bytes(&inode->vfs_inode,
- num_dec);
- }
- }
- clear_len = num_dec;
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- /*
- * we can't truncate inline items that have had
- * special encodings
- */
- if (!del_item &&
- btrfs_file_extent_encryption(leaf, fi) == 0 &&
- btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
- btrfs_file_extent_compression(leaf, fi) == 0) {
- u32 size = (u32)(new_size - found_key.offset);
-
- btrfs_set_file_extent_ram_bytes(leaf, fi, size);
- size = btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(path, size, 1);
- } else if (!del_item) {
- /*
- * We have to bail so the last_size is set to
- * just before this extent.
- */
- ret = NEED_TRUNCATE_BLOCK;
- break;
- } else {
- /*
- * Inline extents are special, we just treat
- * them as a full sector worth in the file
- * extent tree just for simplicity sake.
- */
- clear_len = fs_info->sectorsize;
- }
-
- if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- inode_sub_bytes(&inode->vfs_inode,
- item_end + 1 - new_size);
- }
-delete:
- /*
- * We use btrfs_truncate_inode_items() to clean up log trees for
- * multiple fsyncs, and in this case we don't want to clear the
- * file extent range because it's just the log.
- */
- if (root == inode->root) {
- ret = btrfs_inode_clear_file_extent_range(inode,
- clear_start, clear_len);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- break;
- }
- }
-
- if (del_item)
- last_size = found_key.offset;
- else
- last_size = new_size;
- if (del_item) {
- if (!pending_del_nr) {
- /* no pending yet, add ourselves */
- pending_del_slot = path->slots[0];
- pending_del_nr = 1;
- } else if (pending_del_nr &&
- path->slots[0] + 1 == pending_del_slot) {
- /* hop on the pending chunk */
- pending_del_nr++;
- pending_del_slot = path->slots[0];
- } else {
- BUG();
- }
- } else {
- break;
- }
- should_throttle = false;
-
- if (found_extent &&
- root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- struct btrfs_ref ref = { 0 };
-
- bytes_deleted += extent_num_bytes;
-
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
- extent_start, extent_num_bytes, 0);
- ref.real_root = root->root_key.objectid;
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- ino, extent_offset);
- ret = btrfs_free_extent(trans, &ref);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- break;
- }
- if (be_nice) {
- if (btrfs_should_throttle_delayed_refs(trans))
- should_throttle = true;
- }
- }
-
- if (found_type == BTRFS_INODE_ITEM_KEY)
- break;
-
- if (path->slots[0] == 0 ||
- path->slots[0] != pending_del_slot ||
- should_throttle) {
- if (pending_del_nr) {
- ret = btrfs_del_items(trans, root, path,
- pending_del_slot,
- pending_del_nr);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- break;
- }
- pending_del_nr = 0;
- }
- btrfs_release_path(path);
-
- /*
- * We can generate a lot of delayed refs, so we need to
- * throttle every once and a while and make sure we're
- * adding enough space to keep up with the work we are
- * generating. Since we hold a transaction here we
- * can't flush, and we don't want to FLUSH_LIMIT because
- * we could have generated too many delayed refs to
- * actually allocate, so just bail if we're short and
- * let the normal reservation dance happen higher up.
- */
- if (should_throttle) {
- ret = btrfs_delayed_refs_rsv_refill(fs_info,
- BTRFS_RESERVE_NO_FLUSH);
- if (ret) {
- ret = -EAGAIN;
- break;
- }
- }
- goto search_again;
- } else {
- path->slots[0]--;
- }
- }
-out:
- if (ret >= 0 && pending_del_nr) {
- int err;
-
- err = btrfs_del_items(trans, root, path, pending_del_slot,
- pending_del_nr);
- if (err) {
- btrfs_abort_transaction(trans, err);
- ret = err;
- }
- }
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ASSERT(last_size >= new_size);
- if (!ret && last_size > new_size)
- last_size = new_size;
- btrfs_inode_safe_disk_i_size_write(inode, last_size);
- unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
- &cached_state);
- }
-
- btrfs_free_path(path);
- return ret;
-}
-
-/*
* btrfs_truncate_block - read, zero a chunk and write a block
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
@@ -5035,7 +4693,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
}
- ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
if (ret < 0) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
@@ -5106,7 +4764,8 @@ again:
len);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, block_start,
+ block_end + 1 - block_start);
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
@@ -5246,8 +4905,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
goto next;
}
hole_em->start = cur_offset;
@@ -5416,16 +5074,17 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
}
/*
- * While truncating the inode pages during eviction, we get the VFS calling
- * btrfs_invalidatepage() against each page of the inode. This is slow because
- * the calls to btrfs_invalidatepage() result in a huge amount of calls to
- * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
- * extent_state structures over and over, wasting lots of time.
+ * While truncating the inode pages during eviction, we get the VFS
+ * calling btrfs_invalidate_folio() against each folio of the inode. This
+ * is slow because the calls to btrfs_invalidate_folio() result in a
+ * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
+ * which keep merging and splitting extent_state structures over and over,
+ * wasting lots of time.
*
- * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
- * those expensive operations on a per page basis and do only the ordered io
- * finishing, while we release here the extent_map and extent_state structures,
- * without the excessive merging and splitting.
+ * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
+ * skip all those expensive operations on a per folio basis and do only
+ * the ordered io finishing, while we release here the extent_map and
+ * extent_state structures, without the excessive merging and splitting.
*/
static void evict_inode_truncate_pages(struct inode *inode)
{
@@ -5491,7 +5150,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
* If still has DELALLOC flag, the extent didn't reach disk,
* and its reserved space won't be freed by delayed_ref.
* So we need to free its reserved space here.
- * (Refer to comment in btrfs_invalidatepage, case 2)
+ * (Refer to comment in btrfs_invalidate_folio, case 2)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
@@ -5514,7 +5173,6 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_trans_handle *trans;
u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
int ret;
@@ -5529,18 +5187,16 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
* above. We reserve our extra bit here because we generate a ton of
* delayed refs activity by truncating.
*
- * If we cannot make our reservation we'll attempt to steal from the
- * global reserve, because we really want to be able to free up space.
+ * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
+ * if we fail to make this reservation we can re-try without the
+ * delayed_refs_extra so we can make some forward progress.
*/
- ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
+ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_EVICT);
if (ret) {
- /*
- * Try to steal from the global reserve if there is space for
- * it.
- */
- if (btrfs_check_space_for_delayed_refs(fs_info) ||
- btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
+ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
+ BTRFS_RESERVE_FLUSH_EVICT);
+ if (ret) {
btrfs_warn(fs_info,
"could not allocate space for delete; will truncate on mount");
return ERR_PTR(-ENOSPC);
@@ -5599,10 +5255,22 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
+ /*
+ * This makes sure the inode item in tree is uptodate and the space for
+ * the inode update is released.
+ */
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
if (ret)
goto no_delete;
+ /*
+ * This drops any pending insert or delete operations we have for this
+ * inode. We could have a delayed dir index deletion queued up, but
+ * we're removing the inode completely so that'll be taken care of in
+ * the truncate.
+ */
+ btrfs_kill_delayed_inode_items(BTRFS_I(inode));
+
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
goto no_delete;
@@ -5612,14 +5280,20 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_i_size_write(BTRFS_I(inode), 0);
while (1) {
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(inode),
+ .ino = btrfs_ino(BTRFS_I(inode)),
+ .new_size = 0,
+ .min_type = 0,
+ };
+
trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
goto free_rsv;
trans->block_rsv = rsv;
- ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, 0, NULL);
+ ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -5939,21 +5613,17 @@ static struct inode *new_simple_dir(struct super_block *s,
return inode;
}
+static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
+static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
+static_assert(BTRFS_FT_DIR == FT_DIR);
+static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
+static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
+static_assert(BTRFS_FT_FIFO == FT_FIFO);
+static_assert(BTRFS_FT_SOCK == FT_SOCK);
+static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
+
static inline u8 btrfs_inode_type(struct inode *inode)
{
- /*
- * Compile-time asserts that generic FT_* types still match
- * BTRFS_FT_* types
- */
- BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
- BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
- BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
- BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
- BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
- BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
- BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
- BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
-
return fs_umode_to_ftype(inode->i_mode);
}
@@ -6326,14 +5996,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
goto out;
ret = 0;
- /*
- * MAGIC NUMBER EXPLANATION:
- * since we search a directory based on f_pos we have to start at 2
- * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
- * else has to start at 2
- */
if (path->slots[0] == 0) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -6344,7 +6008,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
if (found_key.objectid != btrfs_ino(inode) ||
found_key.type != BTRFS_DIR_INDEX_KEY) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -6436,7 +6100,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
- int nitems = name ? 2 : 1;
+ struct btrfs_item_batch batch;
unsigned long ptr;
unsigned int nofs_flag;
int ret;
@@ -6495,7 +6159,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* sync since it will be a full sync anyway and this will blow away the
* old info in the log.
*/
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
key[0].objectid = objectid;
key[0].type = BTRFS_INODE_ITEM_KEY;
@@ -6528,7 +6192,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
+ batch.keys = &key[0];
+ batch.data_sizes = &sizes[0];
+ batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
+ batch.nr = name ? 2 : 1;
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret != 0)
goto fail_unlock;
@@ -6888,7 +6556,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
fail:
@@ -6983,8 +6651,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
WARN_ON(pg_offset != 0);
compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
- inline_size = btrfs_file_extent_inline_item_len(leaf,
- btrfs_item_nr(path->slots[0]));
+ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
tmp = kmalloc(inline_size, GFP_NOFS);
if (!tmp)
return -ENOMEM;
@@ -7392,8 +7059,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
- block_len, type);
+ ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
+ block_len, 0,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT),
+ BTRFS_COMPRESS_NONE);
if (ret) {
if (em) {
free_extent_map(em);
@@ -7758,6 +7428,11 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = *map;
+ int type;
+ u64 block_start, orig_start, orig_block_len, ram_bytes;
+ bool can_nocow = false;
+ bool space_reserved = false;
+ u64 prev_len;
int ret = 0;
/*
@@ -7772,9 +7447,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- int type;
- u64 block_start, orig_start, orig_block_len, ram_bytes;
-
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
type = BTRFS_ORDERED_PREALLOC;
else
@@ -7784,53 +7456,91 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
if (can_nocow_extent(inode, start, &len, &orig_start,
&orig_block_len, &ram_bytes, false) == 1 &&
- btrfs_inc_nocow_writers(fs_info, block_start)) {
- struct extent_map *em2;
+ btrfs_inc_nocow_writers(fs_info, block_start))
+ can_nocow = true;
+ }
+
+ prev_len = len;
+ if (can_nocow) {
+ struct extent_map *em2;
- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
- orig_start, block_start,
- len, orig_block_len,
- ram_bytes, type);
+ /* We can NOCOW, so only need to reserve metadata space. */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
+ if (ret < 0) {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
btrfs_dec_nocow_writers(fs_info, block_start);
- if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
- *map = em = em2;
- }
+ goto out;
+ }
+ space_reserved = true;
- if (em2 && IS_ERR(em2)) {
- ret = PTR_ERR(em2);
- goto out;
- }
- /*
- * For inode marked NODATACOW or extent marked PREALLOC,
- * use the existing or preallocated extent, so does not
- * need to adjust btrfs_space_info's bytes_may_use.
- */
- btrfs_free_reserved_data_space_noquota(fs_info, len);
- goto skip_cow;
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+ btrfs_dec_nocow_writers(fs_info, block_start);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ *map = em = em2;
}
- }
- /* this will cow the extent */
- free_extent_map(em);
- *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
+ if (IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
+ goto out;
+ }
+ } else {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+
+ /* We have to COW, so need to reserve metadata and data space. */
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, len);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+
+ em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ *map = em;
+ len = min(len, em->len - (start - em->start));
+ if (len < prev_len)
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start + len, prev_len - len,
+ true);
}
- len = min(len, em->len - (start - em->start));
+ /*
+ * We have created our ordered extent, so we can now release our reservation
+ * for an outstanding extent.
+ */
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
-skip_cow:
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
-
- dio_data->reserve -= len;
out:
+ if (ret && space_reserved) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+ if (can_nocow) {
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+ } else {
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, len, true);
+ extent_changeset_free(dio_data->data_reserved);
+ dio_data->data_reserved = NULL;
+ }
+ }
return ret;
}
@@ -7872,18 +7582,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (!dio_data)
return -ENOMEM;
- dio_data->length = length;
- if (write) {
- dio_data->reserve = round_up(length, fs_info->sectorsize);
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
- &dio_data->data_reserved,
- start, dio_data->reserve);
- if (ret) {
- extent_changeset_free(dio_data->data_reserved);
- kfree(dio_data);
- return ret;
- }
- }
iomap->private = dio_data;
@@ -7924,6 +7622,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
}
len = min(len, em->len - (start - em->start));
+
+ /*
+ * If we have a NOWAIT request and the range contains multiple extents
+ * (or a mix of extents and holes), then we return -EAGAIN to make the
+ * caller fallback to a context where it can do a blocking (without
+ * NOWAIT) request. This way we avoid doing partial IO and returning
+ * success to the caller, which is not optimal for writes and for reads
+ * it can result in unexpected behaviour for an application.
+ *
+ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+ * iomap_dio_rw(), we can end up returning less data than what the caller
+ * asked for, resulting in an unexpected, and incorrect, short read.
+ * That is, the caller asked to read N bytes and we return less than that,
+ * which is wrong unless we are crossing EOF. This happens if we get a
+ * page fault error when trying to fault in pages for the buffer that is
+ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+ * have previously submitted bios for other extents in the range, in
+ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+ * those bios have completed by the time we get the page fault error,
+ * which we return back to our caller - we should only return EIOCBQUEUED
+ * after we have submitted bios for all the extents in the range.
+ */
+ if ((flags & IOMAP_NOWAIT) && len < length) {
+ free_extent_map(em);
+ ret = -EAGAIN;
+ goto unlock_err;
+ }
+
if (write) {
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
start, len);
@@ -7962,7 +7688,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->type = IOMAP_MAPPED;
}
iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
@@ -7976,14 +7702,8 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data) {
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved, start,
- dio_data->reserve, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
- extent_changeset_free(dio_data->data_reserved);
- kfree(dio_data);
- }
+ kfree(dio_data);
+
return ret;
}
@@ -8013,14 +7733,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
ret = -ENOTBLK;
}
- if (write) {
- if (dio_data->reserve)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved, pos,
- dio_data->reserve, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ if (write)
extent_changeset_free(dio_data->data_reserved);
- }
out:
kfree(dio_data);
iomap->private = NULL;
@@ -8039,13 +7753,13 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
- dip->logical_offset,
+ dip->file_offset,
dip->bytes,
!dip->dio_bio->bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
- dip->logical_offset,
- dip->logical_offset + dip->bytes - 1);
+ dip->file_offset,
+ dip->file_offset + dip->bytes - 1);
}
bio_endio(dip->dio_bio);
@@ -8073,10 +7787,11 @@ static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
return ret;
}
-static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
- struct btrfs_io_bio *io_bio,
+static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
+ struct btrfs_bio *bbio,
const bool uptodate)
{
+ struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -8084,19 +7799,20 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- u64 start = io_bio->logical;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
- __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
+ __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
unsigned int i, nr_sectors, pgoff;
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
pgoff = bvec.bv_offset;
for (i = 0; i < nr_sectors; i++) {
+ u64 start = bbio->file_offset + bio_offset;
+
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
- (!csum || !check_data_csum(inode, io_bio,
+ (!csum || !check_data_csum(inode, bbio,
bio_offset, bvec.bv_page,
pgoff, start))) {
clean_io_failure(fs_info, failure_tree, io_tree,
@@ -8106,17 +7822,13 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
} else {
int ret;
- ASSERT((start - io_bio->logical) < UINT_MAX);
- ret = btrfs_repair_one_sector(inode,
- &io_bio->bio,
- start - io_bio->logical,
- bvec.bv_page, pgoff,
- start, io_bio->mirror_num,
+ ret = btrfs_repair_one_sector(inode, &bbio->bio,
+ bio_offset, bvec.bv_page, pgoff,
+ start, bbio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
}
- start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
bio_offset += sectorsize;
pgoff += sectorsize;
@@ -8137,12 +7849,13 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
struct bio *bio,
u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
}
static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t err = bio->bi_status;
if (err)
@@ -8152,15 +7865,13 @@ static void btrfs_end_dio_bio(struct bio *bio)
bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
- if (bio_op(bio) == REQ_OP_READ) {
- err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
- !err);
- }
+ if (bio_op(bio) == REQ_OP_READ)
+ err = btrfs_check_read_dio_bio(dip, bbio, !err);
if (err)
dip->dio_bio->bi_status = err;
- btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+ btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
@@ -8196,16 +7907,16 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
if (ret)
goto err;
} else {
u64 csum_offset;
- csum_offset = file_offset - dip->logical_offset;
+ csum_offset = file_offset - dip->file_offset;
csum_offset >>= fs_info->sectorsize_bits;
csum_offset *= fs_info->csum_size;
- btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
+ btrfs_bio(bio)->csum = dip->csums + csum_offset;
}
map:
ret = btrfs_map_bio(fs_info, bio, 0);
@@ -8240,7 +7951,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return NULL;
dip->inode = inode;
- dip->logical_offset = file_offset;
+ dip->file_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
@@ -8248,7 +7959,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return dip;
}
-static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
+static void btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
struct inode *inode = iter->inode;
@@ -8278,7 +7989,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
}
dio_bio->bi_status = BLK_STS_RESOURCE;
bio_endio(dio_bio);
- return BLK_QC_T_NONE;
+ return;
}
if (!write) {
@@ -8321,7 +8032,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
+ btrfs_bio(bio)->file_offset = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
@@ -8372,15 +8083,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
free_extent_map(em);
} while (submit_len > 0);
- return BLK_QC_T_NONE;
+ return;
out_err_em:
free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
-
- return BLK_QC_T_NONE;
}
const struct iomap_ops btrfs_dio_iomap_ops = {
@@ -8415,8 +8124,13 @@ int btrfs_readpage(struct file *file, struct page *page)
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
- if (bio_ctrl.bio)
- ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
+ if (bio_ctrl.bio) {
+ int ret2;
+
+ ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
+ if (ret == 0)
+ ret = ret2;
+ }
return ret;
}
@@ -8457,8 +8171,8 @@ static void btrfs_readahead(struct readahead_control *rac)
}
/*
- * For releasepage() and invalidatepage() we have a race window where
- * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * For releasepage() and invalidate_folio() we have a race window where
+ * folio_end_writeback() is called but the subpage spinlock is not yet released.
* If we continue to release/invalidate the page, we could cause use-after-free
* for subpage spinlock. So this function is to spin and wait for subpage
* spinlock.
@@ -8534,48 +8248,48 @@ static int btrfs_migratepage(struct address_space *mapping,
}
#endif
-static void btrfs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
- u64 page_start = page_offset(page);
- u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 page_start = folio_pos(folio);
+ u64 page_end = page_start + folio_size(folio) - 1;
u64 cur;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
- * We have page locked so no new ordered extent can be created on this
- * page, nor bio can be submitted for this page.
+ * We have the folio locked so no new ordered extent can be created on this
+ * folio, nor can a bio be submitted for this folio.
*
- * But already submitted bio can still be finished on this page.
- * Furthermore, endio function won't skip page which has Ordered
+ * But already submitted bio can still be finished on this folio.
+ * Furthermore, endio function won't skip folio which has Ordered
* (Private2) already cleared, so it's possible for endio and
- * invalidatepage to do the same ordered extent accounting twice
- * on one page.
+ * invalidate_folio to do the same ordered extent accounting twice
+ * on one folio.
*
* So here we wait for any submitted bios to finish, so that we won't
- * do double ordered extent accounting on the same page.
+ * do double ordered extent accounting on the same folio.
*/
- wait_on_page_writeback(page);
- wait_subpage_spinlock(page);
+ folio_wait_writeback(folio);
+ wait_subpage_spinlock(&folio->page);
/*
* For subpage case, we have call sites like
* btrfs_punch_hole_lock_range() which passes range not aligned to
* sectorsize.
- * If the range doesn't cover the full page, we don't need to and
- * shouldn't clear page extent mapped, as page->private can still
+ * If the range doesn't cover the full folio, we don't need to and
+ * shouldn't clear page extent mapped, as folio->private can still
* record subpage dirty bits for other part of the range.
*
- * For cases that can invalidate the full even the range doesn't
- * cover the full page, like invalidating the last page, we're
+ * For cases that invalidate the full folio even if the range doesn't
+ * cover the full folio, like invalidating the last folio, we're
* still safe to wait for ordered extent to finish.
*/
- if (!(offset == 0 && length == PAGE_SIZE)) {
- btrfs_releasepage(page, GFP_NOFS);
+ if (!(offset == 0 && length == folio_size(folio))) {
+ btrfs_releasepage(&folio->page, GFP_NOFS);
return;
}
@@ -8616,7 +8330,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
+ if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -8626,7 +8340,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
delete_states = false;
goto next;
}
- btrfs_page_clear_ordered(fs_info, page, cur, range_len);
+ btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8696,11 +8410,11 @@ next:
* should not have Ordered (Private2) anymore, or the above iteration
* did something wrong.
*/
- ASSERT(!PageOrdered(page));
+ ASSERT(!folio_test_ordered(folio));
+ btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
- __btrfs_releasepage(page, GFP_NOFS);
- ClearPageChecked(page);
- clear_page_extent_mapped(page);
+ __btrfs_releasepage(&folio->page, GFP_NOFS);
+ clear_page_extent_mapped(&folio->page);
}
/*
@@ -8843,7 +8557,7 @@ again:
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
@@ -8872,6 +8586,12 @@ out_noreserve:
static int btrfs_truncate(struct inode *inode, bool skip_writeback)
{
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(inode),
+ .ino = btrfs_ino(BTRFS_I(inode)),
+ .min_type = BTRFS_EXTENT_DATA_KEY,
+ .clear_extent_range = true,
+ };
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
@@ -8879,7 +8599,6 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
- u64 extents_found = 0;
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
@@ -8940,10 +8659,30 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
trans->block_rsv = rsv;
while (1) {
- ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- inode->i_size,
- BTRFS_EXTENT_DATA_KEY,
- &extents_found);
+ struct extent_state *cached_state = NULL;
+ const u64 new_size = inode->i_size;
+ const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+
+ control.new_size = new_size;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
+ /*
+ * We want to drop from the next block forward in case this new
+ * size is not block aligned since we will be keeping the last
+ * block of the extent just the way it is.
+ */
+ btrfs_drop_extent_cache(BTRFS_I(inode),
+ ALIGN(new_size, fs_info->sectorsize),
+ (u64)-1, 0);
+
+ ret = btrfs_truncate_inode_items(trans, root, &control);
+
+ inode_sub_bytes(inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
+ (u64)-1, &cached_state);
+
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
@@ -8971,11 +8710,11 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
/*
* We can't call btrfs_truncate_block inside a trans handle as we could
- * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
- * we've truncated everything except the last little bit, and can do
- * btrfs_truncate_block and then update the disk_i_size.
+ * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
+ * know we've truncated everything except the last little bit, and can
+ * do btrfs_truncate_block and then update the disk_i_size.
*/
- if (ret == NEED_TRUNCATE_BLOCK) {
+ if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -9019,8 +8758,8 @@ out:
* between the old i_size and the new i_size, and there were no prealloc
* extents beyond i_size to drop.
*/
- if (extents_found > 0)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ if (control.extents_found > 0)
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
return ret;
}
@@ -9073,7 +8812,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_inode *ei;
struct inode *inode;
- ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -9153,8 +8892,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
WARN_ON(inode->block_rsv.reserved);
WARN_ON(inode->block_rsv.size);
WARN_ON(inode->outstanding_extents);
- WARN_ON(inode->delalloc_bytes);
- WARN_ON(inode->new_delalloc_bytes);
+ if (!S_ISDIR(vfs_inode->i_mode)) {
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ }
WARN_ON(inode->csum_bytes);
WARN_ON(inode->defrag_bytes);
@@ -9314,14 +9055,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
+ struct btrfs_rename_ctx old_rename_ctx;
+ struct btrfs_rename_ctx new_rename_ctx;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
int ret2;
- bool root_log_pinned = false;
- bool dest_log_pinned = false;
bool need_abort = false;
/*
@@ -9424,37 +9165,15 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode), 1);
}
- /*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
- *
- * We pin the logs even if at this precise moment none of the inodes was
- * logged before. This is because right after we checked for that, some
- * other task fsyncing some other inode not involved with this rename
- * operation could log that one of our inodes exists.
- *
- * We don't need to pin the logs before the above calls to
- * btrfs_insert_inode_ref(), since those don't ever need to change a log.
- */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
- }
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
- }
-
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &old_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9467,10 +9186,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
- ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
- new_dentry->d_name.len);
+ new_dentry->d_name.len,
+ &new_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
@@ -9500,46 +9220,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_inode->i_nlink == 1)
BTRFS_I(new_inode)->dir_index = new_idx;
- if (root_log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- new_dentry->d_parent);
- btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
- old_dentry->d_parent);
- btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
-out_fail:
/*
- * If we have pinned a log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
*/
- if (ret && (root_log_pinned || dest_log_pinned)) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
- btrfs_set_log_full_commit(trans);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(dest);
- if (root_log_pinned) {
- btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
- }
+ /* Do the log updates for all inodes. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ old_rename_ctx.index, new_dentry->d_parent);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
+ new_rename_ctx.index, old_dentry->d_parent);
+
+ /* Now unpin the logs. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(dest);
+out_fail:
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9614,11 +9319,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
+ struct btrfs_rename_ctx rename_ctx;
u64 index = 0;
int ret;
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
- bool log_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9723,29 +9428,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
- /*
- * Now pin the log. We do it to ensure that no other task can
- * sync the log while we are in progress with the rename, as
- * that could result in an inconsistency in case any of the
- * inodes that are part of this rename operation were logged
- * before.
- *
- * We pin the log even if at this precise moment none of the
- * inodes was logged before. This is because right after we
- * checked for that, some other task fsyncing some other inode
- * not involved with this rename operation could log that one of
- * our inodes exists.
- *
- * We don't need to pin the logs before the above call to
- * btrfs_insert_inode_ref(), since that does not need to change
- * a log.
- */
- btrfs_pin_log_trans(root);
- log_pinned = true;
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9762,7 +9449,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
- ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
new_dentry->d_name.name,
new_dentry->d_name.len);
@@ -9787,12 +9474,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- new_dentry->d_parent);
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
@@ -9804,28 +9488,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
}
}
out_fail:
- /*
- * If we have pinned the log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
- */
- if (ret && log_pinned) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
- btrfs_set_log_full_commit(trans);
-
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9980,7 +9642,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte
};
struct btrfs_fs_info *fs_info = root->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
@@ -9999,7 +9661,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
struct list_head splice;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
INIT_LIST_HEAD(&splice);
@@ -10305,8 +9967,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em = alloc_extent_map();
if (!em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
goto next;
}
@@ -10388,11 +10049,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
-static int btrfs_set_page_dirty(struct page *page)
-{
- return __set_page_dirty_nobuffers(page);
-}
-
static int btrfs_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
@@ -10494,6 +10150,747 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
}
}
+/*
+ * Map an extent's on-disk compression type (BTRFS_COMPRESS_*) to the value
+ * reported through the encoded I/O interface (BTRFS_ENCODED_IO_COMPRESSION_*).
+ *
+ * For LZO the returned value also identifies the sector size: it is
+ * BTRFS_ENCODED_IO_COMPRESSION_LZO_4K plus (sectorsize_bits - 12).
+ *
+ * Returns the encoded I/O compression value on success, -EINVAL for LZO when
+ * the sector size is outside [4K, 64K], or -EUCLEAN for an unrecognized
+ * compression type. Callers in this file treat a negative return as an error.
+ */
+static int btrfs_encoded_io_compression_from_extent(
+ struct btrfs_fs_info *fs_info,
+ int compress_type)
+{
+ switch (compress_type) {
+ case BTRFS_COMPRESS_NONE:
+ return BTRFS_ENCODED_IO_COMPRESSION_NONE;
+ case BTRFS_COMPRESS_ZLIB:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
+ case BTRFS_COMPRESS_LZO:
+ /*
+ * The LZO format depends on the sector size. 64K is the maximum
+ * sector size that we support.
+ */
+ if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
+ return -EINVAL;
+ return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
+ (fs_info->sectorsize_bits - 12);
+ case BTRFS_COMPRESS_ZSTD:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
+ default:
+ return -EUCLEAN;
+ }
+}
+
+/*
+ * Encoded read of an inline extent.
+ *
+ * Looks up the file extent item at @extent_start, fills in @encoded (len,
+ * compression, unencoded_len and, for compressed extents, unencoded_offset)
+ * and copies the inline data to @iter.
+ *
+ * For a compressed inline extent the whole inline item is returned and
+ * -ENOBUFS is returned if it does not fit in @count. For an uncompressed one
+ * the copy starts at iocb->ki_pos and is limited to @count bytes.
+ *
+ * Once the data has been copied into a temporary buffer, the extent range
+ * [@start, @lockend] and the shared inode lock are released and *@unlocked is
+ * set to true, so the caller must not unlock again. On the error paths taken
+ * before that point *@unlocked is left untouched.
+ *
+ * Returns the number of bytes copied to @iter, or a negative errno (-ENOMEM,
+ * -EIO if the extent item was not found, -ENOBUFS, -EFAULT on a short copy,
+ * or an error from the lookup/compression helpers).
+ */
+static ssize_t btrfs_encoded_read_inline(
+ struct kiocb *iocb,
+ struct iov_iter *iter, u64 start,
+ u64 lockend,
+ struct extent_state **cached_state,
+ u64 extent_start, size_t count,
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *item;
+ u64 ram_bytes;
+ unsigned long ptr;
+ void *tmp;
+ ssize_t ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ extent_start, 0);
+ if (ret) {
+ if (ret > 0) {
+ /* The extent item disappeared? */
+ ret = -EIO;
+ }
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+ ptr = btrfs_file_extent_inline_start(item);
+
+ /* Report at most up to i_size, measured from the current file position. */
+ encoded->len = min_t(u64, extent_start + ram_bytes,
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, item));
+ if (ret < 0)
+ goto out;
+ encoded->compression = ret;
+ if (encoded->compression) {
+ size_t inline_size;
+
+ /* Compressed: the entire inline item is returned as-is. */
+ inline_size = btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]);
+ if (inline_size > count) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+ count = inline_size;
+ encoded->unencoded_len = ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - extent_start;
+ } else {
+ /* Uncompressed: return plain data starting at ki_pos. */
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ ptr += iocb->ki_pos - extent_start;
+ }
+
+ /*
+ * Stage the data in a kernel buffer so that the copy to @iter happens
+ * after the path, the extent range and the inode lock are released
+ * (presumably because copying to user memory may fault - confirm).
+ */
+ tmp = kmalloc(count, GFP_NOFS);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(leaf, tmp, ptr, count);
+ btrfs_release_path(path);
+ unlock_extent_cached(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ ret = copy_to_iter(tmp, count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ kfree(tmp);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * State shared between the submitter of an encoded read
+ * (btrfs_encoded_read_regular_fill_pages()) and the bio completion handler
+ * (btrfs_encoded_read_endio()). It is attached to each bio via bi_private.
+ */
+struct btrfs_encoded_read_private {
+ /* Inode being read; used for checksum lookup and verification. */
+ struct btrfs_inode *inode;
+ /* File offset passed to check_data_csum() as the starting offset. */
+ u64 file_offset;
+ /* Woken by the end I/O handler when @pending drops to zero. */
+ wait_queue_head_t wait;
+ /* In-flight bios plus one reference held by the submitter. */
+ atomic_t pending;
+ /* Error status recorded by the submitter or the end I/O handler. */
+ blk_status_t status;
+ /* True when the inode has BTRFS_INODE_NODATASUM set. */
+ bool skip_csum;
+};
+
+/*
+ * Submit one read bio belonging to an encoded read.
+ *
+ * Unless checksums are skipped for this read, the expected checksums are
+ * looked up first. The bio is then set up for the data end I/O workqueue,
+ * accounted in priv->pending and mapped to the device(s).
+ *
+ * Returns BLK_STS_OK on success or the failing helper's status. On failure
+ * the pending count is left unchanged overall and any checksum buffer is
+ * freed, but the bio itself is not put; the caller in this file does that.
+ */
+static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
+ struct bio *bio, int mirror_num)
+{
+ struct btrfs_encoded_read_private *priv = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ blk_status_t ret;
+
+ if (!priv->skip_csum) {
+ ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
+ if (ret)
+ return ret;
+ }
+
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ if (ret) {
+ btrfs_bio_free_csum(bbio);
+ return ret;
+ }
+
+ /* Account the bio before mapping it; undone below if mapping fails. */
+ atomic_inc(&priv->pending);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (ret) {
+ atomic_dec(&priv->pending);
+ btrfs_bio_free_csum(bbio);
+ }
+ return ret;
+}
+
+/*
+ * Verify the data checksums of a completed encoded read bio.
+ *
+ * If checksums are skipped for this read or the bio already failed, the bio's
+ * own status is returned unchanged. Otherwise every segment is checked one
+ * sector at a time with check_data_csum().
+ *
+ * Returns BLK_STS_OK if all sectors pass, BLK_STS_IOERR on the first sector
+ * for which check_data_csum() returns non-zero.
+ */
+static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
+{
+ const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
+ struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+ struct btrfs_inode *inode = priv->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 sectorsize = fs_info->sectorsize;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ u64 start = priv->file_offset;
+ u32 bio_offset = 0;
+
+ if (priv->skip_csum || !uptodate)
+ return bbio->bio.bi_status;
+
+ bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ unsigned int i, nr_sectors, pgoff;
+
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+ pgoff = bvec->bv_offset;
+ /* Advance file offset, bio offset and page offset in lockstep. */
+ for (i = 0; i < nr_sectors; i++) {
+ ASSERT(pgoff < PAGE_SIZE);
+ if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+ bvec->bv_page, pgoff, start))
+ return BLK_STS_IOERR;
+ start += sectorsize;
+ bio_offset += sectorsize;
+ pgoff += sectorsize;
+ }
+ }
+ return BLK_STS_OK;
+}
+
+/*
+ * Completion handler for bios submitted by
+ * btrfs_encoded_read_regular_fill_pages().
+ *
+ * Verifies checksums, records any error in priv->status, drops this bio's
+ * reference on priv->pending and wakes the waiter when the count reaches
+ * zero. Finally frees the bio's checksum buffer and puts the bio.
+ *
+ * NOTE(review): priv is a local variable of the submitter (bi_private is set
+ * to &priv there) and wake_up() touches priv->wait after the final decrement.
+ * Confirm the waiter cannot observe pending == 0 and return before wake_up()
+ * has finished using priv.
+ */
+static void btrfs_encoded_read_endio(struct bio *bio)
+{
+ struct btrfs_encoded_read_private *priv = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ blk_status_t status;
+
+ status = btrfs_encoded_read_verify_csum(bbio);
+ if (status) {
+ /*
+ * The memory barrier implied by the atomic_dec_return() here
+ * pairs with the memory barrier implied by the
+ * atomic_dec_return() or io_wait_event() in
+ * btrfs_encoded_read_regular_fill_pages() to ensure that this
+ * write is observed before the load of status in
+ * btrfs_encoded_read_regular_fill_pages().
+ */
+ WRITE_ONCE(priv->status, status);
+ }
+ if (!atomic_dec_return(&priv->pending))
+ wake_up(&priv->wait);
+ btrfs_bio_free_csum(bbio);
+ bio_put(bio);
+}
+
+/*
+ * Read @disk_io_size bytes starting at disk address @disk_bytenr into @pages
+ * and wait for the I/O to finish.
+ *
+ * The range is split according to the I/O geometry reported for each chunk,
+ * and further whenever a bio cannot take another page. Each page is added
+ * from offset 0, at most PAGE_SIZE bytes at a time. @file_offset is stored in
+ * the private state that the end I/O handler uses for checksum verification.
+ *
+ * Returns 0 on success or a negative errno converted from the recorded block
+ * status.
+ */
+static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset,
+ u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ /*
+ * pending starts at 1: that reference belongs to this function and is
+ * dropped at "out", so the count cannot reach zero while bios are still
+ * being submitted.
+ */
+ struct btrfs_encoded_read_private priv = {
+ .inode = inode,
+ .file_offset = file_offset,
+ .pending = ATOMIC_INIT(1),
+ .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
+ };
+ unsigned long i = 0;
+ u64 cur = 0;
+ int ret;
+
+ init_waitqueue_head(&priv.wait);
+ /*
+ * Submit bios for the extent, splitting due to bio or stripe limits as
+ * necessary.
+ */
+ while (cur < disk_io_size) {
+ struct extent_map *em;
+ struct btrfs_io_geometry geom;
+ struct bio *bio = NULL;
+ u64 remaining;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
+ disk_io_size - cur);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ } else {
+ ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
+ disk_bytenr + cur, &geom);
+ free_extent_map(em);
+ }
+ if (ret) {
+ WRITE_ONCE(priv.status, errno_to_blk_status(ret));
+ break;
+ }
+ remaining = min(geom.len, disk_io_size - cur);
+ /*
+ * Keep looping while there is data left for this geometry or a
+ * partially built bio that still has to be submitted.
+ */
+ while (bio || remaining) {
+ size_t bytes = min_t(u64, remaining, PAGE_SIZE);
+
+ if (!bio) {
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio->bi_iter.bi_sector =
+ (disk_bytenr + cur) >> SECTOR_SHIFT;
+ bio->bi_end_io = btrfs_encoded_read_endio;
+ bio->bi_private = &priv;
+ bio->bi_opf = REQ_OP_READ;
+ }
+
+ /*
+ * Submit when nothing is left to add (bytes == 0) or the
+ * bio cannot take the page. In the latter case the same
+ * page is retried with a fresh bio, since i and cur are
+ * not advanced before the continue.
+ */
+ if (!bytes ||
+ bio_add_page(bio, pages[i], bytes, 0) < bytes) {
+ blk_status_t status;
+
+ status = submit_encoded_read_bio(inode, bio, 0);
+ if (status) {
+ WRITE_ONCE(priv.status, status);
+ bio_put(bio);
+ goto out;
+ }
+ bio = NULL;
+ continue;
+ }
+
+ i++;
+ cur += bytes;
+ remaining -= bytes;
+ }
+ }
+
+out:
+ /* Drop our own reference; wait only if bios are still in flight. */
+ if (atomic_dec_return(&priv.pending))
+ io_wait_event(priv.wait, !atomic_read(&priv.pending));
+ /* See btrfs_encoded_read_endio() for ordering. */
+ return blk_status_to_errno(READ_ONCE(priv.status));
+}
+
+/*
+ * Encoded read of a regular (non-inline, non-hole) extent.
+ *
+ * Allocates enough pages for @disk_io_size bytes, reads the on-disk data into
+ * them, then releases the extent range [@start, @lockend] and the shared
+ * inode lock (setting *@unlocked to true) before copying @count bytes to
+ * @iter.
+ *
+ * When @compressed is set the copy starts at the beginning of the data that
+ * was read; otherwise it starts at the offset of iocb->ki_pos relative to
+ * @start.
+ *
+ * Returns @count on success or a negative errno (-ENOMEM, -EFAULT, or the
+ * read error). If the read itself fails the locks are still held and
+ * *@unlocked is left untouched.
+ */
+static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
+ struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct page **pages;
+ unsigned long nr_pages, i;
+ u64 cur;
+ size_t page_offset;
+ ssize_t ret;
+
+ nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+ /* Zeroed array, so the cleanup loop can skip pages never allocated. */
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(GFP_NOFS);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+ disk_io_size, pages);
+ if (ret)
+ goto out;
+
+ unlock_extent_cached(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ if (compressed) {
+ i = 0;
+ page_offset = 0;
+ } else {
+ /* Skip the part of the read that precedes the file position. */
+ i = (iocb->ki_pos - start) >> PAGE_SHIFT;
+ page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
+ }
+ cur = 0;
+ while (cur < count) {
+ size_t bytes = min_t(size_t, count - cur,
+ PAGE_SIZE - page_offset);
+
+ if (copy_page_to_iter(pages[i], page_offset, bytes,
+ iter) != bytes) {
+ ret = -EFAULT;
+ goto out;
+ }
+ i++;
+ cur += bytes;
+ /* Only the first page copied can start at a non-zero offset. */
+ page_offset = 0;
+ }
+ ret = count;
+out:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+ return ret;
+}
+
+/*
+ * Read the extent containing iocb->ki_pos in its on-disk (encoded) form.
+ *
+ * Takes the inode lock in shared mode, waits for ordered extents in the range
+ * and locks it, then looks up the extent map and dispatches on its kind:
+ * - inline extent: handled by btrfs_encoded_read_inline();
+ * - hole or prealloc extent: zeroes are written to @iter;
+ * - compressed extent: the whole compressed extent is returned, -ENOBUFS if
+ *   it does not fit in @iter;
+ * - plain extent: data is returned from ki_pos, limited to the locked range.
+ *
+ * @encoded is filled in with len, unencoded_len and, for compressed extents,
+ * unencoded_offset and compression. No branch here sets compression for the
+ * hole and plain cases, so the caller is presumably expected to pass @encoded
+ * zero-initialized - confirm against the caller.
+ *
+ * The helpers may drop the extent and inode locks themselves; the local
+ * "unlocked" flag records that so the locks are not released twice.
+ *
+ * Returns the number of bytes copied to @iter (0 at or beyond i_size) or a
+ * negative errno. iocb->ki_pos is advanced by encoded->len only when the
+ * "out" label is reached with a non-negative result.
+ */
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ ssize_t ret;
+ size_t count = iov_iter_count(iter);
+ u64 start, lockend, disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em;
+ bool unlocked = false;
+
+ file_accessed(iocb->ki_filp);
+
+ btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+
+ if (iocb->ki_pos >= inode->vfs_inode.i_size) {
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return 0;
+ }
+ start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
+ /*
+ * We don't know how long the extent containing iocb->ki_pos is, but if
+ * it's compressed we know that it won't be longer than this.
+ */
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ /*
+ * Wait for ordered extents, lock the range, then re-check: if an
+ * ordered extent is found after locking, unlock and try again.
+ */
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+ lock_extent_bits(io_tree, start, lockend, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ cond_resched();
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock_extent;
+ }
+
+ if (em->block_start == EXTENT_MAP_INLINE) {
+ u64 extent_start = em->start;
+
+ /*
+ * For inline extents we get everything we need out of the
+ * extent item.
+ */
+ free_extent_map(em);
+ em = NULL;
+ ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
+ &cached_state, extent_start,
+ count, encoded, &unlocked);
+ goto out;
+ }
+
+ /*
+ * We only want to return up to EOF even if the extent extends beyond
+ * that.
+ */
+ encoded->len = min_t(u64, extent_map_end(em),
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ /* Holes and prealloc extents both read back as zeroes. */
+ disk_bytenr = EXTENT_MAP_HOLE;
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ disk_bytenr = em->block_start;
+ /*
+ * Bail if the buffer isn't large enough to return the whole
+ * compressed extent.
+ */
+ if (em->block_len > count) {
+ ret = -ENOBUFS;
+ goto out_em;
+ }
+ disk_io_size = count = em->block_len;
+ encoded->unencoded_len = em->ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ em->compress_type);
+ if (ret < 0)
+ goto out_em;
+ encoded->compression = ret;
+ } else {
+ disk_bytenr = em->block_start + (start - em->start);
+ if (encoded->len > count)
+ encoded->len = count;
+ /*
+ * Don't read beyond what we locked. This also limits the page
+ * allocations that we'll do.
+ */
+ disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + disk_io_size - iocb->ki_pos;
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ }
+ free_extent_map(em);
+ em = NULL;
+
+ if (disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ unlocked = true;
+ ret = iov_iter_zero(count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ } else {
+ ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ encoded->compression,
+ &unlocked);
+ }
+
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+out_em:
+ free_extent_map(em);
+out_unlock_extent:
+ if (!unlocked)
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+out_unlock_inode:
+ if (!unlocked)
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+}
+
+/*
+ * Write already-compressed data from @from as a single extent at
+ * iocb->ki_pos.
+ *
+ * @encoded describes the data: its compression type, the file length it
+ * covers (len), the length of the decompressed data (unencoded_len) and the
+ * offset into the decompressed data at which the file range starts
+ * (unencoded_offset).
+ *
+ * The arguments are validated first (each failure returns -EINVAL):
+ * - compression must be zlib, zstd or the LZO variant matching the
+ *   filesystem sector size; encryption must be none;
+ * - unencoded_len and the compressed size must be within
+ *   BTRFS_MAX_UNCOMPRESSED / BTRFS_MAX_COMPRESSED, and the compressed data
+ *   must be non-empty and smaller than unencoded_len;
+ * - the start must be sector-aligned, the end must be sector-aligned unless
+ *   it is at or beyond i_size, and unencoded_offset must be sector-aligned.
+ *
+ * The data is copied into freshly allocated pages (zero padded), the file
+ * range is locked once no ordered extents or page cache pages remain in it,
+ * space is reserved, and then either an inline extent is created or an
+ * extent is reserved and a compressed write is submitted.
+ *
+ * Returns the number of compressed bytes consumed from @from on success or a
+ * negative errno. iocb->ki_pos is advanced by encoded->len whenever the
+ * result is non-negative.
+ */
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_changeset *data_reserved = NULL;
+ struct extent_state *cached_state = NULL;
+ int compression;
+ size_t orig_count;
+ u64 start, end;
+ u64 num_bytes, ram_bytes, disk_num_bytes;
+ unsigned long nr_pages, i;
+ struct page **pages;
+ struct btrfs_key ins;
+ bool extent_reserved = false;
+ struct extent_map *em;
+ ssize_t ret;
+
+ switch (encoded->compression) {
+ case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
+ compression = BTRFS_COMPRESS_ZLIB;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
+ compression = BTRFS_COMPRESS_ZSTD;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
+ /* The sector size must match for LZO. */
+ if (encoded->compression -
+ BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
+ fs_info->sectorsize_bits)
+ return -EINVAL;
+ compression = BTRFS_COMPRESS_LZO;
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ return -EINVAL;
+
+ orig_count = iov_iter_count(from);
+
+ /* The extent size must be sane. */
+ if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
+ orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
+ return -EINVAL;
+
+ /*
+ * The compressed data must be smaller than the decompressed data.
+ *
+ * It's of course possible for data to compress to larger or the same
+ * size, but the buffered I/O path falls back to no compression for such
+ * data, and we don't want to break any assumptions by creating these
+ * extents.
+ *
+ * Note that this is less strict than the current check we have that the
+ * compressed data must be at least one sector smaller than the
+ * decompressed data. We only want to enforce the weaker requirement
+ * from old kernels that it is at least one byte smaller.
+ */
+ if (orig_count >= encoded->unencoded_len)
+ return -EINVAL;
+
+ /* The extent must start on a sector boundary. */
+ start = iocb->ki_pos;
+ if (!IS_ALIGNED(start, fs_info->sectorsize))
+ return -EINVAL;
+
+ /*
+ * The extent must end on a sector boundary. However, we allow a write
+ * which ends at or extends i_size to have an unaligned length; we round
+ * up the extent size and set i_size to the unaligned end.
+ */
+ if (start + encoded->len < inode->vfs_inode.i_size &&
+ !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
+ return -EINVAL;
+
+ /* Finally, the offset in the unencoded data must be sector-aligned. */
+ if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
+ return -EINVAL;
+
+ num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
+ ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
+ end = start + num_bytes - 1;
+
+ /*
+ * If the extent cannot be inline, the compressed data on disk must be
+ * sector-aligned. For convenience, we extend it with zeroes if it
+ * isn't.
+ */
+ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
+ nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!pages)
+ return -ENOMEM;
+ /* Copy the compressed data in, zero-filling the tail of each page. */
+ for (i = 0; i < nr_pages; i++) {
+ size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
+ char *kaddr;
+
+ pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ goto out_pages;
+ }
+ kaddr = kmap(pages[i]);
+ if (copy_from_iter(kaddr, bytes, from) != bytes) {
+ kunmap(pages[i]);
+ ret = -EFAULT;
+ goto out_pages;
+ }
+ if (bytes < PAGE_SIZE)
+ memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
+ kunmap(pages[i]);
+ }
+
+ /*
+ * Lock the file range only once it has neither ordered extents nor
+ * page cache pages; otherwise unlock and retry.
+ */
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ if (ret)
+ goto out_pages;
+ ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ if (ret)
+ goto out_pages;
+ lock_extent_bits(io_tree, start, end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
+ if (!ordered &&
+ !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
+ break;
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+ cond_resched();
+ }
+
+ /*
+ * We don't use the higher-level delalloc space functions because our
+ * num_bytes and disk_num_bytes are different.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
+ if (ret)
+ goto out_unlock;
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
+ if (ret)
+ goto out_free_data_space;
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
+ if (ret)
+ goto out_qgroup_free_data;
+
+ /* Try an inline extent first. */
+ if (start == 0 && encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0) {
+ ret = cow_file_range_inline(inode, encoded->len, orig_count,
+ compression, pages, true);
+ /*
+ * 0 is reported as success and negative values as errors; a
+ * positive value falls through to the regular extent path
+ * (presumably "cannot be inlined" - confirm).
+ */
+ if (ret <= 0) {
+ if (ret == 0)
+ ret = orig_count;
+ goto out_delalloc_release;
+ }
+ }
+
+ ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
+ disk_num_bytes, 0, 0, &ins, 1, 1);
+ if (ret)
+ goto out_delalloc_release;
+ extent_reserved = true;
+
+ em = create_io_em(inode, start, num_bytes,
+ start - encoded->unencoded_offset, ins.objectid,
+ ins.offset, ins.offset, ram_bytes, compression,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserved;
+ }
+ free_extent_map(em);
+
+ ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
+ ins.objectid, ins.offset,
+ encoded->unencoded_offset,
+ (1 << BTRFS_ORDERED_ENCODED) |
+ (1 << BTRFS_ORDERED_COMPRESSED),
+ compression);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ goto out_free_reserved;
+ }
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ if (start + encoded->len > inode->vfs_inode.i_size)
+ i_size_write(&inode->vfs_inode, start + encoded->len);
+
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+
+ btrfs_delalloc_release_extents(inode, num_bytes);
+
+ if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+ ins.offset, pages, nr_pages, 0, NULL,
+ false)) {
+ btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
+ ret = -EIO;
+ goto out_pages;
+ }
+ /*
+ * On success the pages are not freed here (the jump skips out_pages);
+ * presumably they are released once the compressed write completes -
+ * confirm.
+ */
+ ret = orig_count;
+ goto out;
+
+out_free_reserved:
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+out_delalloc_release:
+ btrfs_delalloc_release_extents(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
+out_qgroup_free_data:
+ if (ret < 0)
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
+out_free_data_space:
+ /*
+ * If btrfs_reserve_extent() succeeded, then we already decremented
+ * bytes_may_use.
+ */
+ if (!extent_reserved)
+ btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+out_unlock:
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+out_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kvfree(pages);
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+ return ret;
+}
+
#ifdef CONFIG_SWAP
/*
* Add an entry indicating a block group or device which is pinned by a
@@ -10581,9 +10978,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
struct btrfs_swap_info *bsi)
{
unsigned long nr_pages;
+ unsigned long max_pages;
u64 first_ppage, first_ppage_reported, next_ppage;
int ret;
+ /*
+ * Our swapfile may have had its size extended after the swap header was
+ * written. In that case activating the swapfile should not go beyond
+ * the max size set in the swap header.
+ */
+ if (bsi->nr_pages >= sis->max)
+ return 0;
+
+ max_pages = sis->max - bsi->nr_pages;
first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
PAGE_SIZE) >> PAGE_SHIFT;
@@ -10591,6 +10998,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
first_ppage_reported = first_ppage;
if (bsi->start == 0)
@@ -10691,8 +11099,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
* set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents.
+ *
+ * It is possible that the subvolume is marked for deletion but has not
+ * been removed yet. To prevent this race, we check the root status before
+ * activating the swapfile.
*/
+ spin_lock(&root->root_item_lock);
+ if (btrfs_root_dead(root)) {
+ spin_unlock(&root->root_item_lock);
+
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because subvolume %llu is being deleted",
+ root->root_key.objectid);
+ return -EPERM;
+ }
atomic_inc(&root->nr_swapfiles);
+ spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
@@ -10939,12 +11362,12 @@ static const struct address_space_operations btrfs_aops = {
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
- .invalidatepage = btrfs_invalidatepage,
+ .invalidate_folio = btrfs_invalidate_folio,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
.migratepage = btrfs_migratepage,
#endif
- .set_page_dirty = btrfs_set_page_dirty,
+ .dirty_folio = filemap_dirty_folio,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cc61813213d8..be6c24577dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -28,6 +28,7 @@
#include <linux/iversion.h>
#include <linux/fileattr.h>
#include <linux/fsverity.h>
+#include <linux/sched/xacct.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -48,6 +49,7 @@
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "subpage.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -81,11 +83,30 @@ struct btrfs_ioctl_send_args_32 {
compat_uptr_t clone_sources; /* in */
__u64 parent_root; /* in */
__u64 flags; /* in */
- __u64 reserved[4]; /* in */
+ __u32 version; /* in */
+ __u8 reserved[28]; /* in */
} __attribute__ ((__packed__));
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
struct btrfs_ioctl_send_args_32)
+
+/*
+ * Argument layout of the encoded read/write ioctls as seen by 32-bit
+ * userspace: the iovec pointer and count use the compat types. The remaining
+ * members presumably mirror struct btrfs_ioctl_encoded_io_args - confirm
+ * against the UAPI header.
+ */
+struct btrfs_ioctl_encoded_io_args_32 {
+ compat_uptr_t iov;
+ compat_ulong_t iovcnt;
+ __s64 offset;
+ __u64 flags;
+ __u64 len;
+ __u64 unencoded_len;
+ __u64 unencoded_offset;
+ __u32 compression;
+ __u32 encryption;
+ __u8 reserved[64];
+};
+
+#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
+#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
#endif
/* Mask out flags that are inappropriate for the given type of inode. */
@@ -385,6 +406,7 @@ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
*
* Compatibility:
* - the same type is already running
+ * - when trying to add a device and balance has been paused
* - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
* must check the condition first that would allow none -> @type
*/
@@ -392,7 +414,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
spin_lock(&fs_info->super_lock);
- if (fs_info->exclusive_operation == type)
+ if (fs_info->exclusive_operation == type ||
+ (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
+ type == BTRFS_EXCLOP_DEV_ADD))
return true;
spin_unlock(&fs_info->super_lock);
@@ -412,10 +436,31 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
-static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
+/*
+ * Switch the filesystem's exclusive operation state between "balance" and
+ * "balance paused".
+ *
+ * @op must be BTRFS_EXCLOP_BALANCE_PAUSED (the current state is asserted to
+ * be BTRFS_EXCLOP_BALANCE or BTRFS_EXCLOP_DEV_ADD) or BTRFS_EXCLOP_BALANCE
+ * (the current state is asserted to be BTRFS_EXCLOP_BALANCE_PAUSED). The
+ * state is changed under fs_info->super_lock. Any other @op only logs a
+ * warning and leaves the state untouched.
+ */
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op)
{
- struct inode *inode = file_inode(file);
+ switch (op) {
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ default:
+ btrfs_warn(fs_info,
+ "invalid exclop balance operation %d requested", op);
+ }
+}
+/* Copy @inode's i_generation to userspace at @arg; returns put_user()'s result. */
+static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
+{
return put_user(inode->i_generation, arg);
}
@@ -516,7 +561,6 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
struct timespec64 cur_time = current_time(dir);
struct inode *inode;
int ret;
- int err;
dev_t anon_dev = 0;
u64 objectid;
u64 index = 0;
@@ -615,11 +659,13 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
* Since we don't abort the transaction in this case, free the
* tree block so that we don't leak space and leave the
* filesystem in an inconsistent state (an extent item in the
- * extent tree without backreferences). Also no need to have
- * the tree block locked since it is not in any tree at this
- * point, so no other task can find it and use it.
+ * extent tree with a backreference for a root that does not
+ * exist).
*/
- btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ btrfs_tree_lock(leaf);
+ btrfs_clean_tree_block(leaf);
+ btrfs_tree_unlock(leaf);
+ btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
free_extent_buffer(leaf);
goto fail;
}
@@ -694,9 +740,10 @@ fail:
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv);
- err = btrfs_commit_transaction(trans);
- if (err && !ret)
- ret = err;
+ if (ret)
+ btrfs_end_transaction(trans);
+ else
+ ret = btrfs_commit_transaction(trans);
if (!ret) {
inode = btrfs_lookup_dentry(dir, dentry);
@@ -723,6 +770,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_trans_handle *trans;
int ret;
+ /* We do not support snapshotting right now. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_warn(fs_info,
+ "extent tree v2 doesn't support snapshotting yet");
+ return -EOPNOTSUPP;
+ }
+
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
@@ -775,10 +829,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto fail;
}
- spin_lock(&fs_info->trans_lock);
- list_add(&pending_snapshot->list,
- &trans->transaction->pending_snapshots);
- spin_unlock(&fs_info->trans_lock);
+ trans->pending_snapshot = pending_snapshot;
ret = btrfs_commit_transaction(trans);
if (ret)
@@ -986,128 +1037,192 @@ out:
}
/*
- * When we're defragging a range, we don't want to kick it off again
- * if it is really just waiting for delalloc to send it down.
- * If we find a nice big extent or delalloc range for the bytes in the
- * file you want to defrag, we return 0 to let you know to skip this
- * part of the file
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ * To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ * By using btrfs_search_forward() we can skip entire file ranges that
+ * have extents created in past transactions, because btrfs_search_forward()
+ * will not visit leaves and nodes with a generation smaller than given
+ * minimal generation threshold (@newer_than).
+ *
+ * Return valid em if we find a file extent matching the requirement.
+ * Return NULL if we can not find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
*/
-static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+ u64 start, u64 newer_than)
{
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 end;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path path = { 0 };
+ struct extent_map *em;
+ struct btrfs_key key;
+ u64 ino = btrfs_ino(inode);
+ int ret;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
- read_unlock(&em_tree->lock);
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto err;
+ }
- if (em) {
- end = extent_map_end(em);
- free_extent_map(em);
- if (end - offset > thresh)
- return 0;
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (newer_than) {
+ ret = btrfs_search_forward(root, &key, &path, newer_than);
+ if (ret < 0)
+ goto err;
+ /* Can't find anything newer */
+ if (ret > 0)
+ goto not_found;
+ } else {
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ goto err;
}
- /* if we already have a nice delalloc here, just stop */
- thresh /= 2;
- end = count_range_bits(io_tree, &offset, offset + thresh,
- thresh, EXTENT_DELALLOC, 1);
- if (end >= thresh)
- return 0;
- return 1;
-}
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+ /*
+ * If btrfs_search_slot() makes path to point beyond nritems,
+ * we should not have an empty leaf, as this inode must at
+ * least have its INODE_ITEM.
+ */
+ ASSERT(btrfs_header_nritems(path.nodes[0]));
+ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+ }
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ /* Perfect match, no need to go one slot back */
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.offset == start)
+ goto iterate;
-/*
- * helper function to walk through a file and find extents
- * newer than a specific transid, and smaller than thresh.
- *
- * This is used by the defragging code to find new and small
- * extents
- */
-static int find_new_extents(struct btrfs_root *root,
- struct inode *inode, u64 newer_than,
- u64 *off, u32 thresh)
-{
- struct btrfs_path *path;
- struct btrfs_key min_key;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *extent;
- int type;
- int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ /* We didn't find a perfect match, needs to go one slot back */
+ if (path.slots[0] > 0) {
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path.slots[0]--;
+ }
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+iterate:
+ /* Iterate through the path to find a file extent covering @start */
+ while (true) {
+ u64 extent_end;
- min_key.objectid = ino;
- min_key.type = BTRFS_EXTENT_DATA_KEY;
- min_key.offset = *off;
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ goto next;
- while (1) {
- ret = btrfs_search_forward(root, &min_key, path, newer_than);
- if (ret != 0)
- goto none;
-process_slot:
- if (min_key.objectid != ino)
- goto none;
- if (min_key.type != BTRFS_EXTENT_DATA_KEY)
- goto none;
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG &&
- btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
- check_defrag_in_cache(inode, min_key.offset, thresh)) {
- *off = min_key.offset;
- btrfs_free_path(path);
- return 0;
- }
+ /*
+ * We may go one slot back to INODE_REF/XATTR item, then
+ * need to go forward until we reach an EXTENT_DATA.
+ * But we should still have the correct ino as key.objectid.
+ */
+ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
- goto process_slot;
+ /* It's beyond our target range, definitely no extent found */
+ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+
+ /*
+ * | |<- File extent ->|
+ * \- start
+ *
+ * This means there is a hole between start and key.offset.
+ */
+ if (key.offset > start) {
+ em->start = start;
+ em->orig_start = start;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->len = key.offset - start;
+ break;
}
- if (min_key.offset == (u64)-1)
- goto none;
+ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(&path);
- min_key.offset++;
- btrfs_release_path(path);
+ /*
+ * |<- file extent ->| |
+ * \- start
+ *
+ * We haven't reached start, search next slot.
+ */
+ if (extent_end <= start)
+ goto next;
+
+ /* Now this extent covers @start, convert it to em */
+ btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+ break;
+next:
+ ret = btrfs_next_item(root, &path);
+ if (ret < 0)
+ goto err;
+ if (ret > 0)
+ goto not_found;
}
-none:
- btrfs_free_path(path);
- return -ENOENT;
+ btrfs_release_path(&path);
+ return em;
+
+not_found:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return NULL;
+
+err:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return ERR_PTR(ret);
}
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ u64 newer_than, bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
- u64 len = PAGE_SIZE;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
/*
* hopefully we have this extent in the tree already, try without
* the full extent lock
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
+ /*
+ * We can get a merged extent, in that case, we need to re-search
+ * tree to get the original em for defrag.
+ *
+ * If @newer_than is 0 or em::generation < newer_than, we can trust
+ * this em, as either we don't care about the generation, or the
+ * merged extent map will be rejected anyway.
+ */
+ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ newer_than && em->generation >= newer_than) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
if (!em) {
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ u64 end = start + sectorsize - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
- unlock_extent_cached(io_tree, start, end, &cached);
+ if (!locked)
+ lock_extent_bits(io_tree, start, end, &cached);
+ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
+ if (!locked)
+ unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1116,317 +1231,563 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
return em;
}
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+static u32 get_extent_max_capacity(const struct extent_map *em)
+{
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ return BTRFS_MAX_COMPRESSED;
+ return BTRFS_MAX_EXTENT_SIZE;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ u32 extent_thresh, u64 newer_than, bool locked)
{
struct extent_map *next;
- bool ret = true;
+ bool ret = false;
/* this is the last extent */
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len);
+ /*
+ * Here we need to pass @newer_than when checking the next extent, or
+ * we will hit a case where we mark the current extent for defrag, but
+ * the next one will not be a target.
+ * This will just cause extra IO without really reducing the fragments.
+ */
+ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
+ /* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- ret = false;
- else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > SZ_128K && next->block_len > SZ_128K))
- ret = false;
+ goto out;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ goto out;
+ /*
+ * If the next extent is at its max capacity, defragging current extent
+ * makes no sense, as the total number of extents won't change.
+ */
+ if (next->len >= get_extent_max_capacity(em))
+ goto out;
+ /* Skip older extent */
+ if (next->generation < newer_than)
+ goto out;
+ /* Also check extent size */
+ if (next->len >= extent_thresh)
+ goto out;
+ ret = true;
+out:
free_extent_map(next);
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
- u64 *last_len, u64 *skip, u64 *defrag_end,
- int compress)
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: Caller should also wait for page writeback after the cluster is
+ * prepared, here we don't do writeback wait for each page.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+ pgoff_t index)
{
- struct extent_map *em;
- int ret = 1;
- bool next_mergeable = true;
- bool prev_mergeable = true;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
/*
- * make sure that once we start defragging an extent, we keep on
- * defragging it
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
*/
- if (start < *defrag_end)
- return 1;
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
- *skip = 0;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
- em = defrag_lookup_extent(inode, start);
- if (!em)
- return 0;
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
- /* this will cover holes, and inline extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- ret = 0;
- goto out;
- }
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
- if (!*defrag_end)
- prev_mergeable = false;
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need to check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
- next_mergeable = defrag_check_next_extent(inode, em);
/*
- * we hit a real extent, if it is big or the next extent is not a
- * real extent, don't bother defragging it
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
*/
- if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
- ret = 0;
-out:
- /*
- * last_len ends up being a counter of how many bytes we've defragged.
- * every time we choose not to defrag an extent, we reset *last_len
- * so that the next tiny extent will force a defrag.
- *
- * The end result of this is that tiny extents before a single big
- * extent will force at least part of that big extent to be defragged.
- */
- if (ret) {
- *defrag_end = extent_map_end(em);
- } else {
- *last_len = 0;
- *skip = extent_map_end(em);
- *defrag_end = 0;
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
}
-
- free_extent_map(em);
- return ret;
+ return page;
}
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
/*
- * it doesn't do much good to defrag one or two pages
- * at a time. This pulls in a nice chunk of pages
- * to COW and defrag.
- *
- * It also makes sure the delalloc code has enough
- * dirty data to avoid making new small extents as part
- * of the defrag
+ * Collect all valid target extents.
*
- * It's a good idea to start RA on this range
- * before calling this.
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: whether the extent lock for the range is already held
+ * @target_list: list of target file extents
*/
-static int cluster_pages_for_defrag(struct inode *inode,
- struct page **pages,
- unsigned long start_index,
- unsigned long num_pages)
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list,
+ u64 *last_scanned_ret)
{
- unsigned long file_end;
- u64 isize = i_size_read(inode);
- u64 page_start;
- u64 page_end;
- u64 page_cnt;
- u64 start = (u64)start_index << PAGE_SHIFT;
- u64 search_start;
- int ret;
- int i;
- int i_done;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_io_tree *tree;
- struct extent_changeset *data_reserved = NULL;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ bool last_is_target = false;
+ u64 cur = start;
+ int ret = 0;
- file_end = (isize - 1) >> PAGE_SHIFT;
- if (!isize || start_index > file_end)
- return 0;
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
+
+ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur,
+ newer_than, locked);
+ if (!em)
+ break;
- page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+ /* Skip hole/inline/preallocated extents */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- start, page_cnt << PAGE_SHIFT);
- if (ret)
- return ret;
- i_done = 0;
- tree = &BTRFS_I(inode)->io_tree;
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
- /* step one, lock all the pages */
- for (i = 0; i < page_cnt; i++) {
- struct page *page;
-again:
- page = find_or_create_page(inode->i_mapping,
- start_index + i, mask);
- if (!page)
- break;
+ /* This em is under writeback, no need to defrag */
+ if (em->generation == (u64)-1)
+ goto next;
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- break;
- }
+ /*
+ * Our start offset might be in the middle of an existing extent
+ * map, so take that into account.
+ */
+ range_len = em->len - (cur - em->start);
+ /*
+ * If this range of the extent map is already flagged for delalloc,
+ * skip it, because:
+ *
+ * 1) We could deadlock later, when trying to reserve space for
+ * delalloc, because in case we can't immediately reserve space
+ * the flusher can start delalloc and wait for the respective
+ * ordered extents to complete. The deadlock would happen
+ * because we do the space reservation while holding the range
+ * locked, and starting writeback, or finishing an ordered
+ * extent, requires locking the range;
+ *
+ * 2) If there's delalloc there, it means there's dirty pages for
+ * which writeback has not started yet (we clean the delalloc
+ * flag when starting writeback and after creating an ordered
+ * extent). If we mark pages in an adjacent range for defrag,
+ * then we will have a larger contiguous range for delalloc,
+ * very likely resulting in a larger extent after writeback is
+ * triggered (except in a case of free space fragmentation).
+ */
+ if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC, 0, NULL))
+ goto next;
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- while (1) {
- lock_extent_bits(tree, page_start, page_end,
- &cached_state);
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
- page_start);
- unlock_extent_cached(tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
+ /*
+ * For do_compress case, we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * we unlocked the page above, so we need check if
- * it was released or not.
- */
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
+ /* Skip too large extent */
+ if (range_len >= extent_thresh)
+ goto next;
+
+ /*
+ * Skip extents already at their max capacity, this is mostly for
+ * compressed extents, whose max cap is only 128K.
+ */
+ if (em->len >= get_extent_max_capacity(em))
+ goto next;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ extent_thresh, newer_than, locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
}
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- ret = -EIO;
- break;
+add:
+ last_is_target = true;
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
}
+ /* Fall through to allocate a new entry */
}
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
}
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
- pages[i] = page;
- i_done++;
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
}
- if (!i_done || ret)
- goto out;
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- goto out;
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ }
+ if (!ret && last_scanned_ret) {
+ /*
+ * If the last extent is not a target, the caller can skip to
+ * the end of that extent.
+ * Otherwise, we can only go to the end of the specified range.
+ */
+ if (!last_is_target)
+ *last_scanned_ret = max(cur, *last_scanned_ret);
+ else
+ *last_scanned_ret = max(start + len, *last_scanned_ret);
+ }
+ return ret;
+}
- /*
- * so now we have a nice long stream of locked
- * and up to date pages, lets wait on them
- */
- for (i = 0; i < i_done; i++)
- wait_on_page_writeback(pages[i]);
+#define CLUSTER_SIZE (SZ_256K)
+static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
- page_start = page_offset(pages[0]);
- page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
+ ASSERT(last_index - first_index + 1 <= nr_pages);
- lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 0, 0, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ }
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress,
+ u64 *last_scanned_ret)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
+
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+
+ /* Lock the pages range */
+ lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
/*
- * When defragmenting we skip ranges that have holes or inline extents,
- * (check should_defrag_range()), to avoid unnecessary IO and wasting
- * space. At btrfs_defrag_file(), we check if a range should be defragged
- * before locking the inode and then, if it should, we trigger a sync
- * page cache readahead - we lock the inode only after that to avoid
- * blocking for too long other tasks that possibly want to operate on
- * other file ranges. But before we were able to get the inode lock,
- * some other task may have punched a hole in the range, or we may have
- * now an inline extent, in which case we should not defrag. So check
- * for that here, where we have the inode and the range locked, and bail
- * out if that happened.
+ * Now we have a consistent view about the extent map, re-check
+ * which range really needs to be defragged.
+ *
+ * And this time we have extent locked already, pass @locked = true
+ * so that we won't relock the extent range and cause deadlock.
*/
- search_start = page_start;
- while (search_start < page_end) {
- struct extent_map *em;
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list, last_scanned_ret);
+ if (ret < 0)
+ goto unlock_extent;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
- page_end - search_start);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock_range;
- }
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- /* Ok, 0 means we did not defrag anything */
- ret = 0;
- goto out_unlock_range;
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
}
- search_start = extent_map_end(em);
- free_extent_map(em);
}
+ kfree(pages);
+ return ret;
+}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
- page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
- if (i_done != page_cnt) {
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, (page_cnt - i_done) << PAGE_SHIFT, true);
- }
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list, NULL);
+ if (ret < 0)
+ goto out;
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
- set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state);
+ /* Reached or beyond the limit */
+ if (max_sectors && *sectors_defragged >= max_sectors) {
+ ret = 1;
+ break;
+ }
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
- for (i = 0; i < i_done; i++) {
- clear_page_dirty_for_io(pages[i]);
- ClearPageChecked(pages[i]);
- set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
- return i_done;
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
-out_unlock_range:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * Here we may not defrag any range if holes are punched before
+ * we locked the pages.
+ * But that's fine, it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len >>
+ inode->root->fs_info->sectorsize_bits;
+ }
out:
- for (i = 0; i < i_done; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
return ret;
-
}
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NULL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+ * will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where next defrag should be started at.
+ * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
+ * defragging all the range).
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct file_ra_state *ra = NULL;
- unsigned long last_index;
+ unsigned long sectors_defragged = 0;
u64 isize = i_size_read(inode);
- u64 last_len = 0;
- u64 skip = 0;
- u64 defrag_end = 0;
- u64 newer_off = range->start;
- unsigned long i;
- unsigned long ra_index = 0;
- int ret;
- int defrag_count = 0;
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
- unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)SZ_128K - 1);
- struct page **pages = NULL;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ pgoff_t start_index;
if (isize == 0)
return 0;
@@ -1444,172 +1805,113 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (extent_thresh == 0)
extent_thresh = SZ_256K;
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len);
+ } else {
+ /* Defrag until file end */
+ last_byte = isize;
+ }
+
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
/*
- * If we were not given a file, allocate a readahead context. As
+ * If we were not given a ra, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
* we don't error out.
*/
- if (!file) {
+ if (!ra) {
+ ra_allocated = true;
ra = kzalloc(sizeof(*ra), GFP_KERNEL);
if (ra)
file_ra_state_init(ra, inode->i_mapping);
- } else {
- ra = &file->f_ra;
- }
-
- pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out_ra;
- }
-
- /* find the last page to defrag */
- if (range->start + range->len > range->start) {
- last_index = min_t(u64, isize - 1,
- range->start + range->len - 1) >> PAGE_SHIFT;
- } else {
- last_index = (isize - 1) >> PAGE_SHIFT;
}
- if (newer_than) {
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- /*
- * we always align our defrag to help keep
- * the extents in the file evenly spaced
- */
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else
- goto out_ra;
- } else {
- i = range->start >> PAGE_SHIFT;
- }
- if (!max_to_defrag)
- max_to_defrag = last_index - i + 1;
-
/*
- * make writeback starts from i, so the defrag range can be
- * written sequentially.
+ * Make writeback start from the beginning of the range, so that the
+ * defrag range can be written sequentially.
*/
- if (i < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = i;
+ start_index = cur >> PAGE_SHIFT;
+ if (start_index < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = start_index;
- while (i <= last_index && defrag_count < max_to_defrag &&
- (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
- /*
- * make sure we stop running if someone unmounts
- * the FS
- */
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- break;
+ while (cur < last_byte) {
+ const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
+ u64 cluster_end;
if (btrfs_defrag_cancelled(fs_info)) {
- btrfs_debug(fs_info, "defrag_file cancelled");
ret = -EAGAIN;
- goto error;
- }
-
- if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
- extent_thresh, &last_len, &skip,
- &defrag_end, do_compress)){
- unsigned long next;
- /*
- * the should_defrag function tells us how much to skip
- * bump our counter by the suggested amount
- */
- next = DIV_ROUND_UP(skip, PAGE_SIZE);
- i = max(i + 1, next);
- continue;
- }
-
- if (!newer_than) {
- cluster = (PAGE_ALIGN(defrag_end) >>
- PAGE_SHIFT) - i;
- cluster = min(cluster, max_cluster);
- } else {
- cluster = max_cluster;
+ break;
}
- if (i + cluster > ra_index) {
- ra_index = max(i, ra_index);
- if (ra)
- page_cache_sync_readahead(inode->i_mapping, ra,
- file, ra_index, cluster);
- ra_index += cluster;
- }
+ /* We want the cluster end at page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
- } else {
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ btrfs_inode_unlock(inode, 0);
+ break;
}
- if (ret < 0) {
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
btrfs_inode_unlock(inode, 0);
- goto out_ra;
+ break;
}
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
- defrag_count += ret;
- balance_dirty_pages_ratelimited(inode->i_mapping);
- btrfs_inode_unlock(inode, 0);
-
- if (newer_than) {
- if (newer_off == (u64)-1)
- break;
-
- if (ret > 0)
- i += ret;
-
- newer_off = max(newer_off + 1,
- (u64)i << PAGE_SHIFT);
+ if (sectors_defragged > prev_sectors_defragged)
+ balance_dirty_pages_ratelimited(inode->i_mapping);
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else {
- break;
- }
- } else {
- if (ret > 0) {
- i += ret;
- last_len += ret << PAGE_SHIFT;
- } else {
- i++;
- last_len = 0;
- }
+ btrfs_inode_unlock(inode, 0);
+ if (ret < 0)
+ break;
+ cur = max(cluster_end + 1, last_scanned);
+ if (ret > 0) {
+ ret = 0;
+ break;
}
+ cond_resched();
}
- ret = defrag_count;
-error:
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
+ if (ra_allocated)
+ kfree(ra);
+ /*
+ * Update range.start for autodefrag; this indicates where to start
+ * in the next run.
+ */
+ range->start = cur;
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors, for compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
}
-
- if (range->compress_type == BTRFS_COMPRESS_LZO) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- }
-
-out_ra:
if (do_compress) {
btrfs_inode_lock(inode, 0);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
btrfs_inode_unlock(inode, 0);
}
- if (!file)
- kfree(ra);
- kfree(pages);
return ret;
}
@@ -1658,6 +1960,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size;
@@ -1713,7 +2016,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -1730,7 +2034,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
if (!strcmp(sizestr, "max"))
- new_size = device->bdev->bd_inode->i_size;
+ new_size = bdev_nr_bytes(device->bdev);
else {
if (sizestr[0] == '-') {
mod = -1;
@@ -1771,7 +2075,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = -EINVAL;
goto out_finish;
}
- if (new_size > device->bdev->bd_inode->i_size) {
+ if (new_size > bdev_nr_bytes(device->bdev)) {
ret = -EFBIG;
goto out_finish;
}
@@ -1954,10 +2258,9 @@ free_args:
return ret;
}
-static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
void __user *arg)
{
- struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -2121,7 +2424,7 @@ static noinline int copy_to_sk(struct btrfs_path *path,
for (i = slot; i < nritems; i++) {
item_off = btrfs_item_ptr_offset(leaf, i);
- item_len = btrfs_item_size_nr(leaf, i);
+ item_len = btrfs_item_size(leaf, i);
btrfs_item_key_to_cpu(leaf, key, i);
if (!key_in_sk(key, sk))
@@ -2261,9 +2564,8 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
- ret = fault_in_pages_writeable(ubuf + sk_offset,
- *buf_size - sk_offset);
- if (ret)
+ ret = -EFAULT;
+ if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
break;
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
@@ -2288,12 +2590,11 @@ err:
return ret;
}
-static noinline int btrfs_ioctl_tree_search(struct file *file,
- void __user *argp)
+static noinline int btrfs_ioctl_tree_search(struct inode *inode,
+ void __user *argp)
{
struct btrfs_ioctl_search_args __user *uargs;
struct btrfs_ioctl_search_key sk;
- struct inode *inode;
int ret;
size_t buf_size;
@@ -2307,7 +2608,6 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
buf_size = sizeof(uargs->buf);
- inode = file_inode(file);
ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
/*
@@ -2322,12 +2622,11 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
return ret;
}
-static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
void __user *argp)
{
struct btrfs_ioctl_search_args_v2 __user *uarg;
struct btrfs_ioctl_search_args_v2 args;
- struct inode *inode;
int ret;
size_t buf_size;
const size_t buf_limit = SZ_16M;
@@ -2346,7 +2645,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
if (buf_size > buf_limit)
buf_size = buf_limit;
- inode = file_inode(file);
ret = search_ioctl(inode, &args.key, &buf_size,
(char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
@@ -2576,7 +2874,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
btrfs_item_key_to_cpu(leaf, &key, slot);
item_off = btrfs_item_ptr_offset(leaf, slot);
- item_len = btrfs_item_size_nr(leaf, slot);
+ item_len = btrfs_item_size(leaf, slot);
/* Check if dirid in ROOT_REF corresponds to passed dirid */
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
@@ -2597,25 +2895,22 @@ out:
return ret;
}
-static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_ino_lookup_args *args;
- struct inode *inode;
int ret = 0;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
return PTR_ERR(args);
- inode = file_inode(file);
-
/*
* Unprivileged query to obtain the containing subvolume root id. The
* path is reset so it's consistent with btrfs_search_path_in_tree.
*/
if (args->treeid == 0)
- args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+ args->treeid = root->root_key.objectid;
if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
args->name[0] = 0;
@@ -2627,7 +2922,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
goto out;
}
- ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+ ret = btrfs_search_path_in_tree(root->fs_info,
args->treeid, args->objectid,
args->name);
@@ -2683,7 +2978,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
}
/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
-static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
+static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
{
struct btrfs_ioctl_get_subvol_info_args *subvol_info;
struct btrfs_fs_info *fs_info;
@@ -2695,7 +2990,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
struct extent_buffer *leaf;
unsigned long item_off;
unsigned long item_len;
- struct inode *inode;
int slot;
int ret = 0;
@@ -2709,7 +3003,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
return -ENOMEM;
}
- inode = file_inode(file);
fs_info = BTRFS_I(inode)->root->fs_info;
/* Get root_item of inode's subvolume */
@@ -2778,7 +3071,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
item_off = btrfs_item_ptr_offset(leaf, slot)
+ sizeof(struct btrfs_root_ref);
- item_len = btrfs_item_size_nr(leaf, slot)
+ item_len = btrfs_item_size(leaf, slot)
- sizeof(struct btrfs_root_ref);
read_extent_buffer(leaf, subvol_info->name,
item_off, item_len);
@@ -2803,15 +3096,14 @@ out_free:
* Return ROOT_REF information of the subvolume containing this inode
* except the subvolume name.
*/
-static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
+static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
+ void __user *argp)
{
struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
struct btrfs_root_ref *rref;
- struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
- struct inode *inode;
u64 objectid;
int slot;
int ret;
@@ -2827,15 +3119,13 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
return PTR_ERR(rootrefs);
}
- inode = file_inode(file);
- root = BTRFS_I(inode)->root->fs_info->tree_root;
- objectid = BTRFS_I(inode)->root->root_key.objectid;
-
+ objectid = root->root_key.objectid;
key.objectid = objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = rootrefs->min_treeid;
found = 0;
+ root = root->fs_info->tree_root;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
@@ -2915,6 +3205,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
int err = 0;
bool destroy_parent = false;
+ /* We don't support snapshots with extent tree v2 yet. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
+
if (destroy_v2) {
vol_args2 = memdup_user(arg, sizeof(*vol_args2));
if (IS_ERR(vol_args2))
@@ -3098,10 +3395,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
btrfs_inode_lock(inode, 0);
err = btrfs_delete_subvolume(dir, dentry);
btrfs_inode_unlock(inode, 0);
- if (!err) {
- fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
- }
+ if (!err)
+ d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
@@ -3136,12 +3431,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
- /* Subpage defrag will be supported in later commits */
- if (root->fs_info->sectorsize < PAGE_SIZE) {
- ret = -ENOTTY;
- goto out;
- }
-
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
@@ -3176,7 +3465,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), file,
+ ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
@@ -3192,13 +3481,30 @@ out:
static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
struct btrfs_ioctl_vol_args *vol_args;
+ bool restore_op = false;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
- return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device add not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
+ if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+
+ /*
+ * We can do the device add because we have a paused balance,
+ * change the exclusive op type and remember we should bring
+ * back the paused balance
+ */
+ fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD;
+ btrfs_exclop_start_unlock(fs_info);
+ restore_op = true;
+ }
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
@@ -3214,12 +3520,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- btrfs_exclop_finish(fs_info);
+ if (restore_op)
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ else
+ btrfs_exclop_finish(fs_info);
return ret;
}
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
@@ -3231,35 +3541,37 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
- goto err_drop;
- }
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
+
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
- strcmp("cancel", vol_args->name) == 0)
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ args.devid = vol_args->devid;
+ } else if (!strcmp("cancel", vol_args->name)) {
cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret)
- goto out;
- /* Exclusive operation is now claimed */
+ goto err_drop;
- if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
- ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode);
- else
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ /* Exclusive operation is now claimed */
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
btrfs_exclop_finish(fs_info);
@@ -3271,54 +3583,62 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
btrfs_info(fs_info, "device deleted: %s",
vol_args->name);
}
-out:
- kfree(vol_args);
err_drop:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
struct block_device *bdev = NULL;
fmode_t mode;
int ret;
- bool cancel;
+ bool cancel = false;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
- goto out_drop_write;
- }
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- cancel = (strcmp("cancel", vol_args->name) == 0);
+ if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
- kfree(vol_args);
-out_drop_write:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
@@ -3379,22 +3699,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
int ret = 0;
- char *s_uuid = NULL;
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
+ args.devid = di_args->devid;
if (!btrfs_is_empty_uuid(di_args->uuid))
- s_uuid = di_args->uuid;
+ args.uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
- NULL);
-
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
ret = -ENODEV;
goto out;
@@ -3656,7 +3975,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
{
struct btrfs_trans_handle *trans;
u64 transid;
- int ret;
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -3668,11 +3986,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
goto out;
}
transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ btrfs_commit_transaction_async(trans);
out:
if (argp)
if (copy_to_user(argp, &transid, sizeof(transid)))
@@ -3703,6 +4017,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
@@ -3802,6 +4121,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device replace not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
p = memdup_user(arg, sizeof(*p));
if (IS_ERR(p))
return PTR_ERR(p);
@@ -4019,6 +4343,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bool need_unlock; /* for mut. excl. ops lock */
int ret;
+ if (!arg)
+ btrfs_warn(fs_info,
+ "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4091,6 +4419,7 @@ locked:
spin_lock(&fs_info->balance_lock);
bctl->flags |= BTRFS_BALANCE_RESUME;
spin_unlock(&fs_info->balance_lock);
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
goto do_balance;
}
@@ -4430,7 +4759,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_quota_rescan_args qsa = {0};
- int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4441,9 +4769,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
}
if (copy_to_user(arg, &qsa, sizeof(qsa)))
- ret = -EFAULT;
+ return -EFAULT;
- return ret;
+ return 0;
}
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
@@ -4859,7 +5187,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -4889,11 +5217,194 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
if (IS_ERR(arg))
return PTR_ERR(arg);
}
- ret = btrfs_ioctl_send(file, arg);
+ ret = btrfs_ioctl_send(inode, arg);
kfree(arg);
return ret;
}
+static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
+ bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args = { 0 };
+ size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+ flags);
+ size_t copy_end;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+ flags);
+ if (copy_from_user(&args32, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ copy_end = copy_end_kernel;
+ if (copy_from_user(&args, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+ if (args.flags != 0) {
+ ret = -EINVAL;
+ goto out_acct;
+ }
+
+ ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_iov;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(READ, file, &pos, args.len);
+ if (ret < 0)
+ goto out_iov;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ if (ret >= 0) {
+ fsnotify_access(file);
+ if (copy_to_user(argp + copy_end,
+ (char *)&args + copy_end_kernel,
+ sizeof(args) - copy_end_kernel))
+ ret = -EFAULT;
+ }
+
+out_iov:
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+ return ret;
+}
+
+static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EBADF;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ if (copy_from_user(&args32, argp, sizeof(args32))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+ args.len = args32.len;
+ args.unencoded_len = args32.unencoded_len;
+ args.unencoded_offset = args32.unencoded_offset;
+ args.compression = args32.compression;
+ args.encryption = args32.encryption;
+ memcpy(args.reserved, args32.reserved, sizeof(args.reserved));
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ ret = -EINVAL;
+ if (args.flags != 0)
+ goto out_acct;
+ if (memchr_inv(args.reserved, 0, sizeof(args.reserved)))
+ goto out_acct;
+ if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
+ args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ goto out_acct;
+ if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
+ args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
+ goto out_acct;
+ if (args.unencoded_offset > args.unencoded_len)
+ goto out_acct;
+ if (args.len > args.unencoded_len - args.unencoded_offset)
+ goto out_acct;
+
+ ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ file_start_write(file);
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_end_write;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(WRITE, file, &pos, args.len);
+ if (ret < 0)
+ goto out_end_write;
+
+ init_sync_kiocb(&kiocb, file);
+ ret = kiocb_set_rw_flags(&kiocb, 0);
+ if (ret)
+ goto out_end_write;
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_do_write_iter(&kiocb, &iter, &args);
+ if (ret > 0)
+ fsnotify_modify(file);
+
+out_end_write:
+ file_end_write(file);
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -4904,7 +5415,7 @@ long btrfs_ioctl(struct file *file, unsigned int
switch (cmd) {
case FS_IOC_GETVERSION:
- return btrfs_ioctl_getversion(file, argp);
+ return btrfs_ioctl_getversion(inode, argp);
case FS_IOC_GETFSLABEL:
return btrfs_ioctl_get_fslabel(fs_info, argp);
case FS_IOC_SETFSLABEL:
@@ -4924,7 +5435,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SNAP_DESTROY_V2:
return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
- return btrfs_ioctl_subvol_getflags(file, argp);
+ return btrfs_ioctl_subvol_getflags(inode, argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -4945,14 +5456,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_fs_info(fs_info, argp);
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(fs_info, argp);
- case BTRFS_IOC_BALANCE:
- return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_TREE_SEARCH:
- return btrfs_ioctl_tree_search(file, argp);
+ return btrfs_ioctl_tree_search(inode, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
- return btrfs_ioctl_tree_search_v2(file, argp);
+ return btrfs_ioctl_tree_search_v2(inode, argp);
case BTRFS_IOC_INO_LOOKUP:
- return btrfs_ioctl_ino_lookup(file, argp);
+ return btrfs_ioctl_ino_lookup(root, argp);
case BTRFS_IOC_INO_PATHS:
return btrfs_ioctl_ino_to_path(root, argp);
case BTRFS_IOC_LOGICAL_INO:
@@ -4999,10 +5508,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(file, argp, false);
+ return _btrfs_ioctl_send(inode, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(file, argp, true);
+ return _btrfs_ioctl_send(inode, argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
@@ -5029,15 +5538,25 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
case BTRFS_IOC_GET_SUBVOL_INFO:
- return btrfs_ioctl_get_subvol_info(file, argp);
+ return btrfs_ioctl_get_subvol_info(inode, argp);
case BTRFS_IOC_GET_SUBVOL_ROOTREF:
- return btrfs_ioctl_get_subvol_rootref(file, argp);
+ return btrfs_ioctl_get_subvol_rootref(root, argp);
case BTRFS_IOC_INO_LOOKUP_USER:
return btrfs_ioctl_ino_lookup_user(file, argp);
case FS_IOC_ENABLE_VERITY:
return fsverity_ioctl_enable(file, (const void __user *)argp);
case FS_IOC_MEASURE_VERITY:
return fsverity_ioctl_measure(file, argp);
+ case BTRFS_IOC_ENCODED_READ:
+ return btrfs_ioctl_encoded_read(file, argp, false);
+ case BTRFS_IOC_ENCODED_WRITE:
+ return btrfs_ioctl_encoded_write(file, argp, false);
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_READ_32:
+ return btrfs_ioctl_encoded_read(file, argp, true);
+ case BTRFS_IOC_ENCODED_WRITE_32:
+ return btrfs_ioctl_encoded_write(file, argp, true);
+#endif
}
return -ENOTTY;
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index a2e1f1f5c6e3..bbc45534ae9a 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -96,11 +96,12 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
- lockdep_assert_held(&eb->lock);
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
+{
+ lockdep_assert_held_write(&eb->lock);
}
#else
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
#endif
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 3dbe6eb5fda7..430ad36b8b08 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -32,19 +32,19 @@
* payload.
* One regular LZO compressed extent can have one or more segments.
* For inlined LZO compressed extent, only one segment is allowed.
- * One segment represents at most one page of uncompressed data.
+ * One segment represents at most one sector of uncompressed data.
*
* 2.1 Segment header
* Fixed size. LZO_LEN (4) bytes long, LE32.
* Records the total size of the segment (not including the header).
- * Segment header never crosses page boundary, thus it's possible to
- * have at most 3 padding zeros at the end of the page.
+ * Segment header never crosses sector boundary, thus it's possible to
+ * have at most 3 padding zeros at the end of the sector.
*
* 2.2 Data Payload
- * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
- * which is 4419 for a 4KiB page.
+ * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize)
+ * which is 4419 for a 4KiB sectorsize.
*
- * Example:
+ * Example with 4K sectorsize:
* Page 1:
* 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
* 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
@@ -55,6 +55,9 @@
* 0x1000 | SegHdr N+1| Data payload N+1 ... |
*/
+#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level)
return ERR_PTR(-ENOMEM);
workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
- workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
- workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
+ workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL);
+ workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -112,170 +115,187 @@ static inline size_t read_compress_length(const char *buf)
return le32_to_cpu(dlen);
}
+/*
+ * Will do:
+ *
+ * - Write a segment header into the destination
+ * - Copy the compressed buffer into the destination
+ * - Make sure we have enough space in the last sector to fit a segment header
+ * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros.
+ *
+ * Will allocate new pages when needed.
+ */
+static int copy_compressed_data_to_page(char *compressed_data,
+ size_t compressed_size,
+ struct page **out_pages,
+ unsigned long max_nr_page,
+ u32 *cur_out,
+ const u32 sectorsize)
+{
+ u32 sector_bytes_left;
+ u32 orig_out;
+ struct page *cur_page;
+ char *kaddr;
+
+ if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ return -E2BIG;
+
+ /*
+ * We never allow a segment header crossing sector boundary, previous
+ * run should ensure we have enough space left inside the sector.
+ */
+ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+
+ kaddr = kmap(cur_page);
+ write_compress_length(kaddr + offset_in_page(*cur_out),
+ compressed_size);
+ *cur_out += LZO_LEN;
+
+ orig_out = *cur_out;
+
+ /* Copy compressed data */
+ while (*cur_out - orig_out < compressed_size) {
+ u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+ orig_out + compressed_size - *cur_out);
+
+ kunmap(cur_page);
+
+ if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ return -E2BIG;
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+ kaddr = kmap(cur_page);
+
+ memcpy(kaddr + offset_in_page(*cur_out),
+ compressed_data + *cur_out - orig_out, copy_len);
+
+ *cur_out += copy_len;
+ }
+
+ /*
+ * Check if we can fit the next segment header into the remaining space
+ * of the sector.
+ */
+ sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+ if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
+ goto out;
+
+ /* The remaining size is not enough, pad it with zeros */
+ memset(kaddr + offset_in_page(*cur_out), 0,
+ sector_bytes_left);
+ *cur_out += sector_bytes_left;
+
+out:
+ kunmap(cur_page);
+ return 0;
+}
+
int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+ struct page *page_in = NULL;
+ char *sizes_ptr;
+ const unsigned long max_nr_page = *out_pages;
int ret = 0;
- char *data_in;
- char *cpage_out, *sizes_ptr;
- int nr_pages = 0;
- struct page *in_page = NULL;
- struct page *out_page = NULL;
- unsigned long bytes_left;
- unsigned long len = *total_out;
- unsigned long nr_dest_pages = *out_pages;
- const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- size_t in_len;
- size_t out_len;
- char *buf;
- unsigned long tot_in = 0;
- unsigned long tot_out = 0;
- unsigned long pg_bytes_left;
- unsigned long out_offset;
- unsigned long bytes;
+ /* Points to the file offset of input data */
+ u64 cur_in = start;
+ /* Points to the current output byte */
+ u32 cur_out = 0;
+ u32 len = *total_out;
+ ASSERT(max_nr_page > 0);
*out_pages = 0;
*total_out = 0;
*total_in = 0;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
-
/*
- * store the size of all chunks of compressed data in
- * the first 4 bytes
+ * Skip the header for now, we will later come back and write the total
+ * compressed size
*/
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = kmap(out_page);
- out_offset = LZO_LEN;
- tot_out = LZO_LEN;
- pages[0] = out_page;
- nr_pages = 1;
- pg_bytes_left = PAGE_SIZE - LZO_LEN;
-
- /* compress at most one page of data each time */
- in_len = min(len, PAGE_SIZE);
- while (tot_in < len) {
- ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
- &out_len, workspace->mem);
- if (ret != LZO_E_OK) {
- pr_debug("BTRFS: lzo in loop returned %d\n",
- ret);
+ cur_out += LZO_LEN;
+ while (cur_in < start + len) {
+ char *data_in;
+ const u32 sectorsize_mask = sectorsize - 1;
+ u32 sector_off = (cur_in - start) & sectorsize_mask;
+ u32 in_len;
+ size_t out_len;
+
+ /* Get the input page first */
+ if (!page_in) {
+ page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+ ASSERT(page_in);
+ }
+
+ /* Compress at most one sector of data each time */
+ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+ ASSERT(in_len);
+ data_in = kmap(page_in);
+ ret = lzo1x_1_compress(data_in +
+ offset_in_page(cur_in), in_len,
+ workspace->cbuf, &out_len,
+ workspace->mem);
+ kunmap(page_in);
+ if (ret < 0) {
+ pr_debug("BTRFS: lzo in loop returned %d\n", ret);
ret = -EIO;
goto out;
}
- /* store the size of this chunk of compressed data */
- write_compress_length(cpage_out + out_offset, out_len);
- tot_out += LZO_LEN;
- out_offset += LZO_LEN;
- pg_bytes_left -= LZO_LEN;
-
- tot_in += in_len;
- tot_out += out_len;
-
- /* copy bytes from the working buffer into the pages */
- buf = workspace->cbuf;
- while (out_len) {
- bytes = min_t(unsigned long, pg_bytes_left, out_len);
-
- memcpy(cpage_out + out_offset, buf, bytes);
+ ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ pages, max_nr_page,
+ &cur_out, sectorsize);
+ if (ret < 0)
+ goto out;
- out_len -= bytes;
- pg_bytes_left -= bytes;
- buf += bytes;
- out_offset += bytes;
+ cur_in += in_len;
- /*
- * we need another page for writing out.
- *
- * Note if there's less than 4 bytes left, we just
- * skip to a new page.
- */
- if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
- pg_bytes_left == 0) {
- if (pg_bytes_left) {
- memset(cpage_out + out_offset, 0,
- pg_bytes_left);
- tot_out += pg_bytes_left;
- }
-
- /* we're done, don't allocate new page */
- if (out_len == 0 && tot_in >= len)
- break;
-
- kunmap(out_page);
- if (nr_pages == nr_dest_pages) {
- out_page = NULL;
- ret = -E2BIG;
- goto out;
- }
-
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = kmap(out_page);
- pages[nr_pages++] = out_page;
-
- pg_bytes_left = PAGE_SIZE;
- out_offset = 0;
- }
- }
-
- /* we're making it bigger, give up */
- if (tot_in > 8192 && tot_in < tot_out) {
+ /*
+ * Check if we're making the data bigger after compressing more
+ * than two sectors. If so, give up.
+ */
+ if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
ret = -E2BIG;
goto out;
}
- /* we're all done */
- if (tot_in >= len)
- break;
-
- if (tot_out > max_out)
- break;
-
- bytes_left = len - tot_in;
- kunmap(in_page);
- put_page(in_page);
-
- start += PAGE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
- in_len = min(bytes_left, PAGE_SIZE);
- }
-
- if (tot_out >= tot_in) {
- ret = -E2BIG;
- goto out;
+ /* Check if we have reached page boundary */
+ if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+ put_page(page_in);
+ page_in = NULL;
+ }
}
- /* store the size of all chunks of compressed data */
+ /* Store the size of all chunks of compressed data */
sizes_ptr = kmap_local_page(pages[0]);
- write_compress_length(sizes_ptr, tot_out);
+ write_compress_length(sizes_ptr, cur_out);
kunmap_local(sizes_ptr);
ret = 0;
- *total_out = tot_out;
- *total_in = tot_in;
+ *total_out = cur_out;
+ *total_in = cur_in - start;
out:
- *out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
-
- if (in_page) {
- kunmap(in_page);
- put_page(in_page);
- }
-
+ if (page_in)
+ put_page(page_in);
+ *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
return ret;
}
@@ -357,11 +377,23 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
- kaddr = kmap(cur_page);
ASSERT(cur_page);
+ kaddr = kmap(cur_page);
seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ kunmap(cur_page);
cur_in += LZO_LEN;
+ if (seg_len > WORKSPACE_CBUF_LENGTH) {
+ /*
+ * seg_len shouldn't be larger than we have allocated
+ * for workspace->cbuf
+ */
+ btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
+ seg_len);
+ ret = -EIO;
+ goto out;
+ }
+
/* Copy the compressed segment payload into workspace */
copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
@@ -404,7 +436,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
- size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
+ size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
char *kaddr;
unsigned long bytes;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6b51fd2ec5ac..1957b14b329a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -143,16 +143,28 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
-/*
- * Allocate and add a new ordered_extent into the per-inode tree.
+/**
+ * Add an ordered extent to the per-inode tree.
+ *
+ * @inode: Inode that this extent is for.
+ * @file_offset: Logical offset in file where the extent starts.
+ * @num_bytes: Logical length of extent in file.
+ * @ram_bytes: Full length of unencoded data.
+ * @disk_bytenr: Offset of extent on disk.
+ * @disk_num_bytes: Size of extent on disk.
+ * @offset: Offset into unencoded data where file data starts.
+ * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @compress_type: Compression algorithm used for data.
*
- * The tree is given a single reference on the ordered extent that was
- * inserted.
+ * Most of these parameters correspond to &struct btrfs_file_extent_item. The
+ * tree is given a single reference on the ordered extent that was inserted.
+ *
+ * Return: 0 or -ENOMEM.
*/
-static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type, int dio,
- int compress_type)
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -161,7 +173,8 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
struct btrfs_ordered_extent *entry;
int ret;
- if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
+ if (flags &
+ ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
if (ret < 0)
@@ -181,9 +194,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
return -ENOMEM;
entry->file_offset = file_offset;
- entry->disk_bytenr = disk_bytenr;
entry->num_bytes = num_bytes;
+ entry->ram_bytes = ram_bytes;
+ entry->disk_bytenr = disk_bytenr;
entry->disk_num_bytes = disk_num_bytes;
+ entry->offset = offset;
entry->bytes_left = num_bytes;
entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
@@ -191,18 +206,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
entry->qgroup_rsv = ret;
entry->physical = (u64)-1;
- ASSERT(type == BTRFS_ORDERED_REGULAR ||
- type == BTRFS_ORDERED_NOCOW ||
- type == BTRFS_ORDERED_PREALLOC ||
- type == BTRFS_ORDERED_COMPRESSED);
- set_bit(type, &entry->flags);
+ ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
+ entry->flags = flags;
percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
fs_info->delalloc_batch);
- if (dio)
- set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
-
/* one ref for the tree */
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
@@ -247,41 +256,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
return 0;
}
-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
- int type)
-{
- ASSERT(type == BTRFS_ORDERED_REGULAR ||
- type == BTRFS_ORDERED_NOCOW ||
- type == BTRFS_ORDERED_PREALLOC);
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 0,
- BTRFS_COMPRESS_NONE);
-}
-
-int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type)
-{
- ASSERT(type == BTRFS_ORDERED_REGULAR ||
- type == BTRFS_ORDERED_NOCOW ||
- type == BTRFS_ORDERED_PREALLOC);
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 1,
- BTRFS_COMPRESS_NONE);
-}
-
-int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int compress_type)
-{
- ASSERT(compress_type != BTRFS_COMPRESS_NONE);
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes,
- BTRFS_ORDERED_COMPRESSED, 0,
- compress_type);
-}
-
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
@@ -548,9 +522,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
- if (root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
- false);
+ if (root != fs_info->tree_root) {
+ u64 release;
+
+ if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
+ release = entry->disk_num_bytes;
+ else
+ release = entry->num_bytes;
+ btrfs_delalloc_release_metadata(btrfs_inode, release, false);
+ }
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
@@ -1052,42 +1032,18 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
u64 file_offset = ordered->file_offset + pos;
u64 disk_bytenr = ordered->disk_bytenr + pos;
- u64 num_bytes = len;
- u64 disk_num_bytes = len;
- int type;
- unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT);
- int compress_type = ordered->compress_type;
- unsigned long weight;
- int ret;
-
- weight = hweight_long(flags_masked);
- WARN_ON_ONCE(weight > 1);
- if (!weight)
- type = 0;
- else
- type = __ffs(flags_masked);
+ unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
/*
- * The splitting extent is already counted and will be added again
- * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid
- * double counting.
+ * The splitting extent is already counted and will be added again in
+ * btrfs_add_ordered_extent(). Subtract len to avoid double counting.
*/
- percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes,
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
fs_info->delalloc_batch);
- if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
- WARN_ON_ONCE(1);
- ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode),
- file_offset, disk_bytenr, num_bytes,
- disk_num_bytes, compress_type);
- } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
- ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset,
- disk_bytenr, num_bytes, disk_num_bytes, type);
- } else {
- ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset,
- disk_bytenr, num_bytes, disk_num_bytes, type);
- }
-
- return ret;
+ WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
+ return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
+ disk_bytenr, len, 0, flags,
+ ordered->compress_type);
}
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4194e960ff61..ecad67a2c745 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,8 +74,18 @@ enum {
BTRFS_ORDERED_LOGGED_CSUM,
/* We wait for this extent to complete in the current transaction */
BTRFS_ORDERED_PENDING,
+ /* BTRFS_IOC_ENCODED_WRITE */
+ BTRFS_ORDERED_ENCODED,
};
+/* BTRFS_ORDERED_* flags that specify the type of the extent. */
+#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \
+ (1UL << BTRFS_ORDERED_NOCOW) | \
+ (1UL << BTRFS_ORDERED_PREALLOC) | \
+ (1UL << BTRFS_ORDERED_COMPRESSED) | \
+ (1UL << BTRFS_ORDERED_DIRECT) | \
+ (1UL << BTRFS_ORDERED_ENCODED))
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -84,9 +94,11 @@ struct btrfs_ordered_extent {
* These fields directly correspond to the same fields in
* btrfs_file_extent_item.
*/
- u64 disk_bytenr;
u64 num_bytes;
+ u64 ram_bytes;
+ u64 disk_bytenr;
u64 disk_num_bytes;
+ u64 offset;
/* number of bytes that still need writing */
u64 bytes_left;
@@ -179,14 +191,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
- int type);
-int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type);
-int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int compress_type);
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index aae1027bd76a..dd8777872143 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -23,6 +23,7 @@ static const struct root_name_map root_map[] = {
{ BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
{ BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
};
@@ -85,7 +86,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 item_size = btrfs_item_size(eb, slot);
u64 flags;
u64 offset;
int ref_index = 0;
@@ -200,7 +201,6 @@ void btrfs_print_leaf(struct extent_buffer *l)
struct btrfs_fs_info *fs_info;
int i;
u32 type, nr;
- struct btrfs_item *item;
struct btrfs_root_item *ri;
struct btrfs_dir_item *di;
struct btrfs_inode_item *ii;
@@ -224,12 +224,11 @@ void btrfs_print_leaf(struct extent_buffer *l)
btrfs_leaf_free_space(l), btrfs_header_owner(l));
print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
- item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
type = key.type;
pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
i, key.objectid, type, key.offset,
- btrfs_item_offset(l, item), btrfs_item_size(l, item));
+ btrfs_item_offset(l, i), btrfs_item_size(l, i));
switch (type) {
case BTRFS_INODE_ITEM_KEY:
ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
@@ -347,7 +346,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
case BTRFS_UUID_KEY_SUBVOL:
case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
- btrfs_item_size_nr(l, i));
+ btrfs_item_size(l, i));
break;
}
}
@@ -393,9 +392,9 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
btrfs_header_owner(c),
btrfs_node_ptr_generation(c, i),
level - 1, &first_key);
- if (IS_ERR(next)) {
+ if (IS_ERR(next))
continue;
- } else if (!extent_buffer_uptodate(next)) {
+ if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
continue;
}
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index b1cb5a8c2999..1b31481f9e72 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -17,9 +17,11 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
struct prop_handler {
struct hlist_node node;
const char *xattr_name;
- int (*validate)(const char *value, size_t len);
+ int (*validate)(const struct btrfs_inode *inode, const char *value,
+ size_t len);
int (*apply)(struct inode *inode, const char *value, size_t len);
const char *(*extract)(struct inode *inode);
+ bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -55,7 +57,8 @@ find_prop_handler(const char *name,
return NULL;
}
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len)
{
const struct prop_handler *handler;
@@ -69,7 +72,29 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
if (value_len == 0)
return 0;
- return handler->validate(value, value_len);
+ return handler->validate(inode, value, value_len);
+}
+
+/*
+ * Check if a property should be ignored (not set) for an inode.
+ *
+ * @inode: The target inode.
+ * @name: The property's name.
+ *
+ * The caller must be sure the given property name is valid, for example by
+ * having previously called btrfs_validate_prop().
+ *
+ * Returns: true if the property should be ignored for the given inode
+ * false if the property must not be ignored for the given inode
+ */
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name)
+{
+ const struct prop_handler *handler;
+
+ handler = find_prop_handler(name, NULL);
+ ASSERT(handler != NULL);
+
+ return handler->ignore(inode);
}
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
@@ -158,7 +183,7 @@ static int iterate_object_props(struct btrfs_root *root,
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
cur = 0;
- total_len = btrfs_item_size_nr(leaf, slot);
+ total_len = btrfs_item_size(leaf, slot);
while (cur < total_len) {
u32 name_len = btrfs_dir_name_len(leaf, di);
@@ -252,8 +277,12 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
return ret;
}
-static int prop_compression_validate(const char *value, size_t len)
+static int prop_compression_validate(const struct btrfs_inode *inode,
+ const char *value, size_t len)
{
+ if (!btrfs_inode_can_compress(inode))
+ return -EINVAL;
+
if (!value)
return 0;
@@ -310,6 +339,22 @@ static int prop_compression_apply(struct inode *inode, const char *value,
return 0;
}
+static bool prop_compression_ignore(const struct btrfs_inode *inode)
+{
+ /*
+ * Compression only has effect for regular files, and for directories
+ * we set it just to propagate it to new files created inside them.
+ * Everything else (symlinks, devices, sockets, fifos) is pointless as
+ * it will do nothing, so don't waste metadata space on a compression
+ * xattr for anything that is neither a file nor a directory.
+ */
+ if (!S_ISREG(inode->vfs_inode.i_mode) &&
+ !S_ISDIR(inode->vfs_inode.i_mode))
+ return true;
+
+ return false;
+}
+
static const char *prop_compression_extract(struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
@@ -330,6 +375,7 @@ static struct prop_handler prop_handlers[] = {
.validate = prop_compression_validate,
.apply = prop_compression_apply,
.extract = prop_compression_extract,
+ .ignore = prop_compression_ignore,
.inheritable = 1
},
};
@@ -356,6 +402,9 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (!h->inheritable)
continue;
+ if (h->ignore(BTRFS_I(inode)))
+ continue;
+
value = h->extract(parent);
if (!value)
continue;
@@ -364,7 +413,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
* This is not strictly necessary as the property should be
* valid, but in case it isn't, don't propagate it further.
*/
- ret = h->validate(value, strlen(value));
+ ret = h->validate(BTRFS_I(inode), value, strlen(value));
if (ret)
continue;
@@ -377,8 +426,9 @@ static int inherit_props(struct btrfs_trans_handle *trans,
*/
if (need_reserve) {
num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
- ret = btrfs_block_rsv_add(root, trans->block_rsv,
- num_bytes, BTRFS_RESERVE_NO_FLUSH);
+ ret = btrfs_block_rsv_add(fs_info, trans->block_rsv,
+ num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
if (ret)
return ret;
}
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 40b2c65b518c..59bea741cfcf 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -13,7 +13,9 @@ void __init btrfs_props_init(void);
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const char *value, size_t value_len,
int flags);
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len);
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len);
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index db680f5be745..1866b1f0da01 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -25,18 +25,6 @@
#include "sysfs.h"
#include "tree-mod-log.h"
-/* TODO XXX FIXME
- * - subvol delete -> delete when ref goes to 0? delete limits also?
- * - reorganize keys
- * - compressed
- * - sync
- * - copy also limits on subvol creation
- * - limit
- * - caches for ulists
- * - performance benchmarks
- * - check all ioctl parameters
- */
-
/*
* Helpers to access qgroup reservation
*
@@ -258,16 +246,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return 0;
}
-/* must be called with qgroup_lock held */
-static int add_relation_rb(struct btrfs_fs_info *fs_info,
- u64 memberid, u64 parentid)
+/*
+ * Add relation specified by two qgroups.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the qgroups is NULL
+ * <0 other errors
+ */
+static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
{
- struct btrfs_qgroup *member;
- struct btrfs_qgroup *parent;
struct btrfs_qgroup_list *list;
- member = find_qgroup_rb(fs_info, memberid);
- parent = find_qgroup_rb(fs_info, parentid);
if (!member || !parent)
return -ENOENT;
@@ -283,7 +274,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info,
return 0;
}
-/* must be called with qgroup_lock held */
+/*
+ * Add relation specified by two qgroup ids.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the ids does not exist
+ * <0 other errors
+ */
+static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+
+ return __add_relation_rb(member, parent);
+}
+
+/* Must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
u64 memberid, u64 parentid)
{
@@ -940,6 +951,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
int ret = 0;
int slot;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to enable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+ * and before setting BTRFS_FS_QUOTA_ENABLED.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "qgroups are currently unsupported in extent tree v2");
+ return -EINVAL;
+ }
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
@@ -1117,8 +1142,19 @@ out_add_root:
goto out_free_path;
}
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ /*
+ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+ * a deadlock with tasks concurrently doing other qgroup operations, such as
+ * adding/removing qgroups or adding/deleting qgroup relations for example,
+ * because all qgroup operations first start or join a transaction and then
+ * lock the qgroup_ioctl_lock mutex.
+ * We are safe from a concurrent task trying to enable quotas, by calling
+ * this function, since we are serialized by fs_info->subvol_sem.
+ */
ret = btrfs_commit_transaction(trans);
trans = NULL;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (ret)
goto out_free_path;
@@ -1166,12 +1202,34 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to disable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
+
+ /*
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+ * to lock that mutex while holding a transaction handle and the rescan
+ * worker needs to commit a transaction.
+ */
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
+ * Request the qgroup rescan worker to complete and wait for it. This wait
+ * must be done before starting the transaction for quota disable, since it
+ * may otherwise deadlock with a transaction of the qgroup rescan worker.
+ */
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ /*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
@@ -1186,14 +1244,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
goto out;
}
if (!fs_info->quota_root)
goto out;
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- btrfs_qgroup_wait_for_completion(fs_info, false);
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
@@ -1219,7 +1276,8 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_tree_lock(quota_root->node);
btrfs_clean_tree_block(quota_root->node);
btrfs_tree_unlock(quota_root->node);
- btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
+ quota_root->node, 0, 1);
btrfs_put_root(quota_root);
@@ -1410,7 +1468,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
spin_lock(&fs_info->qgroup_lock);
- ret = add_relation_rb(fs_info, src, dst);
+ ret = __add_relation_rb(member, parent);
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
@@ -3141,6 +3199,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
@@ -3150,7 +3209,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
int ret;
mutex_lock(&fs_info->qgroup_rescan_lock);
- ret = btrfs_search_slot_for_read(fs_info->extent_root,
+ extent_root = btrfs_extent_root(fs_info,
+ fs_info->qgroup_rescan_progress.objectid);
+ ret = btrfs_search_slot_for_read(extent_root,
&fs_info->qgroup_rescan_progress,
path, 1, 0);
@@ -3224,7 +3285,8 @@ out:
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
return btrfs_fs_closing(fs_info) ||
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3254,11 +3316,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = PTR_ERR(trans);
break;
}
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- err = -EINTR;
- } else {
- err = qgroup_rescan_leaf(trans, path);
- }
+
+ err = qgroup_rescan_leaf(trans, path);
+
if (err > 0)
btrfs_commit_transaction(trans);
else
@@ -3272,7 +3332,7 @@ out:
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- } else if (err < 0) {
+ } else if (err < 0 || stopped) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3360,6 +3420,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
+ } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* Quota disable is in progress */
+ ret = -EBUSY;
}
if (ret) {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d8d268ca8aa7..0e239a4c3b26 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -60,8 +60,7 @@ enum btrfs_rbio_ops {
};
struct btrfs_raid_bio {
- struct btrfs_fs_info *fs_info;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
/* while we're doing rmw on a stripe
* we put it into a hash table so we can
@@ -192,7 +191,7 @@ static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
btrfs_init_work(&rbio->work, work_func, NULL, NULL);
- btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+ btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
@@ -271,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->bbio->raid_map[0];
+ u64 num = rbio->bioc->raid_map[0];
/*
* we shift down quite a bit. We're using byte
@@ -345,7 +344,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
h = table->table + bucket;
/* hold the lock for the bucket because we may be
@@ -400,7 +399,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
__remove_rbio_from_cache(rbio);
@@ -460,7 +459,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
spin_lock(&rbio->bio_list_lock);
@@ -559,8 +558,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->bbio->raid_map[0] !=
- cur->bbio->raid_map[0])
+ if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
return 0;
/* we can't merge with different operations */
@@ -669,11 +667,11 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
- h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+ h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
continue;
spin_lock(&cur->bio_list_lock);
@@ -751,7 +749,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
int keep_cache = 0;
bucket = rbio_bucket(rbio);
- h = rbio->fs_info->stripe_hash_table->table + bucket;
+ h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
if (list_empty(&rbio->plug_list))
cache_rbio(rbio);
@@ -838,7 +836,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
}
}
- btrfs_put_bbio(rbio->bbio);
+ btrfs_put_bioc(rbio->bioc);
kfree(rbio);
}
@@ -865,7 +863,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *extra;
if (rbio->generic_bio_cnt)
- btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -906,7 +904,7 @@ static void raid_write_end_io(struct bio *bio)
/* OK, we have read all the stripes we need to. */
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
- 0 : rbio->bbio->max_errors;
+ 0 : rbio->bioc->max_errors;
if (atomic_read(&rbio->error) > max_errors)
err = BLK_STS_IOERR;
@@ -961,12 +959,12 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
- struct btrfs_bio *bbio,
+ struct btrfs_io_context *bioc,
u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
- int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
int num_pages = rbio_nr_pages(stripe_len, real_stripes);
int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
@@ -987,8 +985,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
- rbio->bbio = bbio;
- rbio->fs_info = fs_info;
+ rbio->bioc = bioc;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
rbio->real_stripes = real_stripes;
@@ -1015,9 +1012,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
nr_data = real_stripes - 1;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
nr_data = real_stripes - 2;
else
BUG();
@@ -1077,10 +1074,10 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
u64 disk_start;
- stripe = &rbio->bbio->stripes[stripe_nr];
+ stripe = &rbio->bioc->stripes[stripe_nr];
disk_start = stripe->physical + (page_index << PAGE_SHIFT);
/* if the device is missing, just fail this stripe */
@@ -1105,8 +1102,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
- btrfs_io_bio(bio)->device = stripe->dev;
+ bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+ btrfs_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1155,11 +1152,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
int i = 0;
start = bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->bbio->raid_map[0];
+ stripe_offset = start - rbio->bioc->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_io_bio(bio)->iter;
+ bio->bi_iter = btrfs_bio(bio)->iter;
bio_for_each_segment(bvec, bio, iter) {
rbio->bio_pages[page_index + i] = bvec.bv_page;
@@ -1179,7 +1176,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
@@ -1284,11 +1281,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
}
- if (likely(!bbio->num_tgtdevs))
+ if (likely(!bioc->num_tgtdevs))
goto write_data;
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- if (!bbio->tgtdev_map[stripe])
+ if (!bioc->tgtdev_map[stripe])
continue;
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
@@ -1302,7 +1299,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
ret = rbio_add_io_page(rbio, &bio_list, page,
- rbio->bbio->tgtdev_map[stripe],
+ rbio->bioc->tgtdev_map[stripe],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
@@ -1339,12 +1336,12 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
{
u64 physical = bio->bi_iter.bi_sector;
int i;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
physical <<= 9;
- for (i = 0; i < rbio->bbio->num_stripes; i++) {
- stripe = &rbio->bbio->stripes[i];
+ for (i = 0; i < rbio->bioc->num_stripes; i++) {
+ stripe = &rbio->bioc->stripes[i];
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
@@ -1365,7 +1362,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
int i;
for (i = 0; i < rbio->nr_data; i++) {
- u64 stripe_start = rbio->bbio->raid_map[i];
+ u64 stripe_start = rbio->bioc->raid_map[i];
if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
@@ -1456,7 +1453,7 @@ static void raid_rmw_end_io(struct bio *bio)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
/*
@@ -1538,8 +1535,8 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -1547,7 +1544,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -1719,17 +1716,18 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len)
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
int ret;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
@@ -1842,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* all raid6 handling here */
- if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
/*
* single failure, rebuild from parity raid5
* style
@@ -1874,8 +1872,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* here due to a crc mismatch and we can't give them the
* data they want
*/
- if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bbio->raid_map[faila] ==
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
RAID5_P_STRIPE) {
err = BLK_STS_IOERR;
goto cleanup;
@@ -1887,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto pstripe;
}
- if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
@@ -2006,7 +2004,7 @@ static void raid_recover_end_io(struct bio *bio)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
else
__raid_recover_end_io(rbio);
@@ -2074,7 +2072,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* were up to date, or we might have no bios to read because
* the devices were gone.
*/
- if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
+ if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
__raid_recover_end_io(rbio);
return 0;
} else {
@@ -2083,8 +2081,8 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -2092,7 +2090,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2116,22 +2114,22 @@ cleanup:
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io)
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int ret;
if (generic_io) {
- ASSERT(bbio->mirror_num == mirror_num);
- btrfs_io_bio(bio)->mirror_num = mirror_num;
+ ASSERT(bioc->mirror_num == mirror_num);
+ btrfs_bio(bio)->mirror_num = mirror_num;
}
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
@@ -2142,11 +2140,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
btrfs_warn(fs_info,
- "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
+"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bbio->map_type);
+ (u64)bio->bi_iter.bi_size, bioc->map_type);
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
kfree(rbio);
return -EIO;
}
@@ -2155,7 +2153,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
} else {
- btrfs_get_bbio(bbio);
+ btrfs_get_bioc(bioc);
}
/*
@@ -2214,23 +2212,23 @@ static void read_rebuild_work(struct btrfs_work *work)
/*
* The following code is used to scrub/replace the parity stripe
*
- * Caller must have already increased bio_counter for getting @bbio.
+ * Caller must have already increased bio_counter for getting @bioc.
*
* Note: We need make sure all the pages that add into the scrub/replace
* raid bio are correct and not be changed during the scrub/replace. That
* is those pages just hold metadata or file data with checksum.
*/
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors)
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc,
+ u64 stripe_len, struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2242,12 +2240,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
/*
- * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
+ * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
* to the end position, so this search can start from the first parity
* stripe.
*/
for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
- if (bbio->stripes[i].dev == scrub_dev) {
+ if (bioc->stripes[i].dev == scrub_dev) {
rbio->scrubp = i;
break;
}
@@ -2260,7 +2258,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
/*
- * We have already increased bio_counter when getting bbio, record it
+ * We have already increased bio_counter when getting bioc, record it
* so we can free it at rbio_orig_end_io().
*/
rbio->generic_bio_cnt = 1;
@@ -2275,10 +2273,10 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
int stripe_offset;
int index;
- ASSERT(logical >= rbio->bbio->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
+ ASSERT(logical >= rbio->bioc->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
+ stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
index = stripe_offset >> PAGE_SHIFT;
rbio->bio_pages[index] = page;
}
@@ -2312,7 +2310,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
@@ -2335,7 +2333,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
else
BUG();
- if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+ if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
}
@@ -2435,7 +2433,7 @@ writeback:
page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
ret = rbio_add_io_page(rbio, &bio_list, page,
- bbio->tgtdev_map[rbio->scrubp],
+ bioc->tgtdev_map[rbio->scrubp],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
@@ -2483,7 +2481,7 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
*/
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
if (rbio->faila >= 0 || rbio->failb >= 0) {
@@ -2504,7 +2502,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
* the data, so the capability of the repair is declined.
* (In the case of RAID5, we can not repair anything)
*/
- if (dfail > rbio->bbio->max_errors - 1)
+ if (dfail > rbio->bioc->max_errors - 1)
goto cleanup;
/*
@@ -2625,8 +2623,8 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -2634,7 +2632,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2670,12 +2668,13 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(fs_info, bbio, length);
+ rbio = alloc_rbio(fs_info, bioc, length);
if (IS_ERR(rbio))
return NULL;
@@ -2695,7 +2694,7 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
/*
- * When we get bbio, we have already increased bio_counter, record it
+ * When we get bioc, we have already increased bio_counter, record it
* so we can free it at rbio_orig_end_io()
*/
rbio->generic_bio_cnt = 1;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2503485db859..72c00fc284b5 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -30,25 +30,23 @@ static inline int nr_data_stripes(const struct map_lookup *map)
struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io);
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len);
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io);
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
u64 logical);
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors);
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
deleted file mode 100644
index 06713a8fe26b..000000000000
--- a/fs/btrfs/reada.c
+++ /dev/null
@@ -1,1086 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2011 STRATO. All rights reserved.
- */
-
-#include <linux/sched.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-#include "ctree.h"
-#include "volumes.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "dev-replace.h"
-#include "block-group.h"
-
-#undef DEBUG
-
-/*
- * This is the implementation for the generic read ahead framework.
- *
- * To trigger a readahead, btrfs_reada_add must be called. It will start
- * a read ahead for the given range [start, end) on tree root. The returned
- * handle can either be used to wait on the readahead to finish
- * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
- *
- * The read ahead works as follows:
- * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
- * reada_start_machine will then search for extents to prefetch and trigger
- * some reads. When a read finishes for a node, all contained node/leaf
- * pointers that lie in the given range will also be enqueued. The reads will
- * be triggered in sequential order, thus giving a big win over a naive
- * enumeration. It will also make use of multi-device layouts. Each disk
- * will have its on read pointer and all disks will by utilized in parallel.
- * Also will no two disks read both sides of a mirror simultaneously, as this
- * would waste seeking capacity. Instead both disks will read different parts
- * of the filesystem.
- * Any number of readaheads can be started in parallel. The read order will be
- * determined globally, i.e. 2 parallel readaheads will normally finish faster
- * than the 2 started one after another.
- */
-
-#define MAX_IN_FLIGHT 6
-
-struct reada_extctl {
- struct list_head list;
- struct reada_control *rc;
- u64 generation;
-};
-
-struct reada_extent {
- u64 logical;
- u64 owner_root;
- struct btrfs_key top;
- struct list_head extctl;
- int refcnt;
- spinlock_t lock;
- struct reada_zone *zones[BTRFS_MAX_MIRRORS];
- int nzones;
- int scheduled;
- int level;
-};
-
-struct reada_zone {
- u64 start;
- u64 end;
- u64 elems;
- struct list_head list;
- spinlock_t lock;
- int locked;
- struct btrfs_device *device;
- struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
- * self */
- int ndevs;
- struct kref refcnt;
-};
-
-struct reada_machine_work {
- struct btrfs_work work;
- struct btrfs_fs_info *fs_info;
-};
-
-static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
-static void reada_control_release(struct kref *kref);
-static void reada_zone_release(struct kref *kref);
-static void reada_start_machine(struct btrfs_fs_info *fs_info);
-static void __reada_start_machine(struct btrfs_fs_info *fs_info);
-
-static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 owner_root,
- u64 generation, int level);
-
-/* recurses */
-/* in case of err, eb might be NULL */
-static void __readahead_hook(struct btrfs_fs_info *fs_info,
- struct reada_extent *re, struct extent_buffer *eb,
- int err)
-{
- int nritems;
- int i;
- u64 bytenr;
- u64 generation;
- struct list_head list;
-
- spin_lock(&re->lock);
- /*
- * just take the full list from the extent. afterwards we
- * don't need the lock anymore
- */
- list_replace_init(&re->extctl, &list);
- re->scheduled = 0;
- spin_unlock(&re->lock);
-
- /*
- * this is the error case, the extent buffer has not been
- * read correctly. We won't access anything from it and
- * just cleanup our data structures. Effectively this will
- * cut the branch below this node from read ahead.
- */
- if (err)
- goto cleanup;
-
- /*
- * FIXME: currently we just set nritems to 0 if this is a leaf,
- * effectively ignoring the content. In a next step we could
- * trigger more readahead depending from the content, e.g.
- * fetch the checksums for the extents in the leaf.
- */
- if (!btrfs_header_level(eb))
- goto cleanup;
-
- nritems = btrfs_header_nritems(eb);
- generation = btrfs_header_generation(eb);
- for (i = 0; i < nritems; i++) {
- struct reada_extctl *rec;
- u64 n_gen;
- struct btrfs_key key;
- struct btrfs_key next_key;
-
- btrfs_node_key_to_cpu(eb, &key, i);
- if (i + 1 < nritems)
- btrfs_node_key_to_cpu(eb, &next_key, i + 1);
- else
- next_key = re->top;
- bytenr = btrfs_node_blockptr(eb, i);
- n_gen = btrfs_node_ptr_generation(eb, i);
-
- list_for_each_entry(rec, &list, list) {
- struct reada_control *rc = rec->rc;
-
- /*
- * if the generation doesn't match, just ignore this
- * extctl. This will probably cut off a branch from
- * prefetch. Alternatively one could start a new (sub-)
- * prefetch for this branch, starting again from root.
- * FIXME: move the generation check out of this loop
- */
-#ifdef DEBUG
- if (rec->generation != generation) {
- btrfs_debug(fs_info,
- "generation mismatch for (%llu,%d,%llu) %llu != %llu",
- key.objectid, key.type, key.offset,
- rec->generation, generation);
- }
-#endif
- if (rec->generation == generation &&
- btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
- btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
- reada_add_block(rc, bytenr, &next_key,
- btrfs_header_owner(eb), n_gen,
- btrfs_header_level(eb) - 1);
- }
- }
-
-cleanup:
- /*
- * free extctl records
- */
- while (!list_empty(&list)) {
- struct reada_control *rc;
- struct reada_extctl *rec;
-
- rec = list_first_entry(&list, struct reada_extctl, list);
- list_del(&rec->list);
- rc = rec->rc;
- kfree(rec);
-
- kref_get(&rc->refcnt);
- if (atomic_dec_and_test(&rc->elems)) {
- kref_put(&rc->refcnt, reada_control_release);
- wake_up(&rc->wait);
- }
- kref_put(&rc->refcnt, reada_control_release);
-
- reada_extent_put(fs_info, re); /* one ref for each entry */
- }
-
- return;
-}
-
-int btree_readahead_hook(struct extent_buffer *eb, int err)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int ret = 0;
- struct reada_extent *re;
-
- /* find extent */
- spin_lock(&fs_info->reada_lock);
- re = radix_tree_lookup(&fs_info->reada_tree,
- eb->start >> fs_info->sectorsize_bits);
- if (re)
- re->refcnt++;
- spin_unlock(&fs_info->reada_lock);
- if (!re) {
- ret = -1;
- goto start_machine;
- }
-
- __readahead_hook(fs_info, re, eb, err);
- reada_extent_put(fs_info, re); /* our ref */
-
-start_machine:
- reada_start_machine(fs_info);
- return ret;
-}
-
-static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
- struct btrfs_bio *bbio)
-{
- struct btrfs_fs_info *fs_info = dev->fs_info;
- int ret;
- struct reada_zone *zone;
- struct btrfs_block_group *cache = NULL;
- u64 start;
- u64 end;
- int i;
-
- zone = NULL;
- spin_lock(&fs_info->reada_lock);
- ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> fs_info->sectorsize_bits, 1);
- if (ret == 1 && logical >= zone->start && logical <= zone->end) {
- kref_get(&zone->refcnt);
- spin_unlock(&fs_info->reada_lock);
- return zone;
- }
-
- spin_unlock(&fs_info->reada_lock);
-
- cache = btrfs_lookup_block_group(fs_info, logical);
- if (!cache)
- return NULL;
-
- start = cache->start;
- end = start + cache->length - 1;
- btrfs_put_block_group(cache);
-
- zone = kzalloc(sizeof(*zone), GFP_KERNEL);
- if (!zone)
- return NULL;
-
- ret = radix_tree_preload(GFP_KERNEL);
- if (ret) {
- kfree(zone);
- return NULL;
- }
-
- zone->start = start;
- zone->end = end;
- INIT_LIST_HEAD(&zone->list);
- spin_lock_init(&zone->lock);
- zone->locked = 0;
- kref_init(&zone->refcnt);
- zone->elems = 0;
- zone->device = dev; /* our device always sits at index 0 */
- for (i = 0; i < bbio->num_stripes; ++i) {
- /* bounds have already been checked */
- zone->devs[i] = bbio->stripes[i].dev;
- }
- zone->ndevs = bbio->num_stripes;
-
- spin_lock(&fs_info->reada_lock);
- ret = radix_tree_insert(&dev->reada_zones,
- (unsigned long)(zone->end >> fs_info->sectorsize_bits),
- zone);
-
- if (ret == -EEXIST) {
- kfree(zone);
- ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> fs_info->sectorsize_bits, 1);
- if (ret == 1 && logical >= zone->start && logical <= zone->end)
- kref_get(&zone->refcnt);
- else
- zone = NULL;
- }
- spin_unlock(&fs_info->reada_lock);
- radix_tree_preload_end();
-
- return zone;
-}
-
-static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
- u64 logical,
- struct btrfs_key *top,
- u64 owner_root, int level)
-{
- int ret;
- struct reada_extent *re = NULL;
- struct reada_extent *re_exist = NULL;
- struct btrfs_bio *bbio = NULL;
- struct btrfs_device *dev;
- struct btrfs_device *prev_dev;
- u64 length;
- int real_stripes;
- int nzones = 0;
- unsigned long index = logical >> fs_info->sectorsize_bits;
- int dev_replace_is_ongoing;
- int have_zone = 0;
-
- spin_lock(&fs_info->reada_lock);
- re = radix_tree_lookup(&fs_info->reada_tree, index);
- if (re)
- re->refcnt++;
- spin_unlock(&fs_info->reada_lock);
-
- if (re)
- return re;
-
- re = kzalloc(sizeof(*re), GFP_KERNEL);
- if (!re)
- return NULL;
-
- re->logical = logical;
- re->top = *top;
- INIT_LIST_HEAD(&re->extctl);
- spin_lock_init(&re->lock);
- re->refcnt = 1;
- re->owner_root = owner_root;
- re->level = level;
-
- /*
- * map block
- */
- length = fs_info->nodesize;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio, 0);
- if (ret || !bbio || length < fs_info->nodesize)
- goto error;
-
- if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
- btrfs_err(fs_info,
- "readahead: more than %d copies not supported",
- BTRFS_MAX_MIRRORS);
- goto error;
- }
-
- real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
- for (nzones = 0; nzones < real_stripes; ++nzones) {
- struct reada_zone *zone;
-
- dev = bbio->stripes[nzones].dev;
-
- /* cannot read ahead on missing device. */
- if (!dev->bdev)
- continue;
-
- zone = reada_find_zone(dev, logical, bbio);
- if (!zone)
- continue;
-
- re->zones[re->nzones++] = zone;
- spin_lock(&zone->lock);
- if (!zone->elems)
- kref_get(&zone->refcnt);
- ++zone->elems;
- spin_unlock(&zone->lock);
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
- spin_unlock(&fs_info->reada_lock);
- }
- if (re->nzones == 0) {
- /* not a single zone found, error and out */
- goto error;
- }
-
- /* Insert extent in reada tree + all per-device trees, all or nothing */
- down_read(&fs_info->dev_replace.rwsem);
- ret = radix_tree_preload(GFP_KERNEL);
- if (ret) {
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
-
- spin_lock(&fs_info->reada_lock);
- ret = radix_tree_insert(&fs_info->reada_tree, index, re);
- if (ret == -EEXIST) {
- re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
- re_exist->refcnt++;
- spin_unlock(&fs_info->reada_lock);
- radix_tree_preload_end();
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
- if (ret) {
- spin_unlock(&fs_info->reada_lock);
- radix_tree_preload_end();
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
- radix_tree_preload_end();
- prev_dev = NULL;
- dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
- &fs_info->dev_replace);
- for (nzones = 0; nzones < re->nzones; ++nzones) {
- dev = re->zones[nzones]->device;
-
- if (dev == prev_dev) {
- /*
- * in case of DUP, just add the first zone. As both
- * are on the same device, there's nothing to gain
- * from adding both.
- * Also, it wouldn't work, as the tree is per device
- * and adding would fail with EEXIST
- */
- continue;
- }
- if (!dev->bdev)
- continue;
-
- if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
- continue;
-
- if (dev_replace_is_ongoing &&
- dev == fs_info->dev_replace.tgtdev) {
- /*
- * as this device is selected for reading only as
- * a last resort, skip it for read ahead.
- */
- continue;
- }
- prev_dev = dev;
- ret = radix_tree_insert(&dev->reada_extents, index, re);
- if (ret) {
- while (--nzones >= 0) {
- dev = re->zones[nzones]->device;
- BUG_ON(dev == NULL);
- /* ignore whether the entry was inserted */
- radix_tree_delete(&dev->reada_extents, index);
- }
- radix_tree_delete(&fs_info->reada_tree, index);
- spin_unlock(&fs_info->reada_lock);
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
- have_zone = 1;
- }
- if (!have_zone)
- radix_tree_delete(&fs_info->reada_tree, index);
- spin_unlock(&fs_info->reada_lock);
- up_read(&fs_info->dev_replace.rwsem);
-
- if (!have_zone)
- goto error;
-
- btrfs_put_bbio(bbio);
- return re;
-
-error:
- for (nzones = 0; nzones < re->nzones; ++nzones) {
- struct reada_zone *zone;
-
- zone = re->zones[nzones];
- kref_get(&zone->refcnt);
- spin_lock(&zone->lock);
- --zone->elems;
- if (zone->elems == 0) {
- /*
- * no fs_info->reada_lock needed, as this can't be
- * the last ref
- */
- kref_put(&zone->refcnt, reada_zone_release);
- }
- spin_unlock(&zone->lock);
-
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
- spin_unlock(&fs_info->reada_lock);
- }
- btrfs_put_bbio(bbio);
- kfree(re);
- return re_exist;
-}
-
-static void reada_extent_put(struct btrfs_fs_info *fs_info,
- struct reada_extent *re)
-{
- int i;
- unsigned long index = re->logical >> fs_info->sectorsize_bits;
-
- spin_lock(&fs_info->reada_lock);
- if (--re->refcnt) {
- spin_unlock(&fs_info->reada_lock);
- return;
- }
-
- radix_tree_delete(&fs_info->reada_tree, index);
- for (i = 0; i < re->nzones; ++i) {
- struct reada_zone *zone = re->zones[i];
-
- radix_tree_delete(&zone->device->reada_extents, index);
- }
-
- spin_unlock(&fs_info->reada_lock);
-
- for (i = 0; i < re->nzones; ++i) {
- struct reada_zone *zone = re->zones[i];
-
- kref_get(&zone->refcnt);
- spin_lock(&zone->lock);
- --zone->elems;
- if (zone->elems == 0) {
- /* no fs_info->reada_lock needed, as this can't be
- * the last ref */
- kref_put(&zone->refcnt, reada_zone_release);
- }
- spin_unlock(&zone->lock);
-
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
- spin_unlock(&fs_info->reada_lock);
- }
-
- kfree(re);
-}
-
-static void reada_zone_release(struct kref *kref)
-{
- struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
- struct btrfs_fs_info *fs_info = zone->device->fs_info;
-
- lockdep_assert_held(&fs_info->reada_lock);
-
- radix_tree_delete(&zone->device->reada_zones,
- zone->end >> fs_info->sectorsize_bits);
-
- kfree(zone);
-}
-
-static void reada_control_release(struct kref *kref)
-{
- struct reada_control *rc = container_of(kref, struct reada_control,
- refcnt);
-
- kfree(rc);
-}
-
-static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 owner_root,
- u64 generation, int level)
-{
- struct btrfs_fs_info *fs_info = rc->fs_info;
- struct reada_extent *re;
- struct reada_extctl *rec;
-
- /* takes one ref */
- re = reada_find_extent(fs_info, logical, top, owner_root, level);
- if (!re)
- return -1;
-
- rec = kzalloc(sizeof(*rec), GFP_KERNEL);
- if (!rec) {
- reada_extent_put(fs_info, re);
- return -ENOMEM;
- }
-
- rec->rc = rc;
- rec->generation = generation;
- atomic_inc(&rc->elems);
-
- spin_lock(&re->lock);
- list_add_tail(&rec->list, &re->extctl);
- spin_unlock(&re->lock);
-
- /* leave the ref on the extent */
-
- return 0;
-}
-
-/*
- * called with fs_info->reada_lock held
- */
-static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
-{
- int i;
- unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;
-
- for (i = 0; i < zone->ndevs; ++i) {
- struct reada_zone *peer;
- peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
- if (peer && peer->device != zone->device)
- peer->locked = lock;
- }
-}
-
-/*
- * called with fs_info->reada_lock held
- */
-static int reada_pick_zone(struct btrfs_device *dev)
-{
- struct reada_zone *top_zone = NULL;
- struct reada_zone *top_locked_zone = NULL;
- u64 top_elems = 0;
- u64 top_locked_elems = 0;
- unsigned long index = 0;
- int ret;
-
- if (dev->reada_curr_zone) {
- reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
- kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
- dev->reada_curr_zone = NULL;
- }
- /* pick the zone with the most elements */
- while (1) {
- struct reada_zone *zone;
-
- ret = radix_tree_gang_lookup(&dev->reada_zones,
- (void **)&zone, index, 1);
- if (ret == 0)
- break;
- index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
- if (zone->locked) {
- if (zone->elems > top_locked_elems) {
- top_locked_elems = zone->elems;
- top_locked_zone = zone;
- }
- } else {
- if (zone->elems > top_elems) {
- top_elems = zone->elems;
- top_zone = zone;
- }
- }
- }
- if (top_zone)
- dev->reada_curr_zone = top_zone;
- else if (top_locked_zone)
- dev->reada_curr_zone = top_locked_zone;
- else
- return 0;
-
- dev->reada_next = dev->reada_curr_zone->start;
- kref_get(&dev->reada_curr_zone->refcnt);
- reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
-
- return 1;
-}
-
-static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 owner_root, int level, int mirror_num,
- struct extent_buffer **eb)
-{
- struct extent_buffer *buf = NULL;
- int ret;
-
- buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
- if (IS_ERR(buf))
- return 0;
-
- set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
-
- ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
- if (ret) {
- free_extent_buffer_stale(buf);
- return ret;
- }
-
- if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
- free_extent_buffer_stale(buf);
- return -EIO;
- } else if (extent_buffer_uptodate(buf)) {
- *eb = buf;
- } else {
- free_extent_buffer(buf);
- }
- return 0;
-}
-
-static int reada_start_machine_dev(struct btrfs_device *dev)
-{
- struct btrfs_fs_info *fs_info = dev->fs_info;
- struct reada_extent *re = NULL;
- int mirror_num = 0;
- struct extent_buffer *eb = NULL;
- u64 logical;
- int ret;
- int i;
-
- spin_lock(&fs_info->reada_lock);
- if (dev->reada_curr_zone == NULL) {
- ret = reada_pick_zone(dev);
- if (!ret) {
- spin_unlock(&fs_info->reada_lock);
- return 0;
- }
- }
- /*
- * FIXME currently we issue the reads one extent at a time. If we have
- * a contiguous block of extents, we could also coagulate them or use
- * plugging to speed things up
- */
- ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> fs_info->sectorsize_bits, 1);
- if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
- ret = reada_pick_zone(dev);
- if (!ret) {
- spin_unlock(&fs_info->reada_lock);
- return 0;
- }
- re = NULL;
- ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> fs_info->sectorsize_bits, 1);
- }
- if (ret == 0) {
- spin_unlock(&fs_info->reada_lock);
- return 0;
- }
- dev->reada_next = re->logical + fs_info->nodesize;
- re->refcnt++;
-
- spin_unlock(&fs_info->reada_lock);
-
- spin_lock(&re->lock);
- if (re->scheduled || list_empty(&re->extctl)) {
- spin_unlock(&re->lock);
- reada_extent_put(fs_info, re);
- return 0;
- }
- re->scheduled = 1;
- spin_unlock(&re->lock);
-
- /*
- * find mirror num
- */
- for (i = 0; i < re->nzones; ++i) {
- if (re->zones[i]->device == dev) {
- mirror_num = i + 1;
- break;
- }
- }
- logical = re->logical;
-
- atomic_inc(&dev->reada_in_flight);
- ret = reada_tree_block_flagged(fs_info, logical, re->owner_root,
- re->level, mirror_num, &eb);
- if (ret)
- __readahead_hook(fs_info, re, NULL, ret);
- else if (eb)
- __readahead_hook(fs_info, re, eb, ret);
-
- if (eb)
- free_extent_buffer(eb);
-
- atomic_dec(&dev->reada_in_flight);
- reada_extent_put(fs_info, re);
-
- return 1;
-
-}
-
-static void reada_start_machine_worker(struct btrfs_work *work)
-{
- struct reada_machine_work *rmw;
- int old_ioprio;
-
- rmw = container_of(work, struct reada_machine_work, work);
-
- old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
- task_nice_ioprio(current));
- set_task_ioprio(current, BTRFS_IOPRIO_READA);
- __reada_start_machine(rmw->fs_info);
- set_task_ioprio(current, old_ioprio);
-
- atomic_dec(&rmw->fs_info->reada_works_cnt);
-
- kfree(rmw);
-}
-
-/* Try to start up to 10k READA requests for a group of devices */
-static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
-{
- u64 enqueued;
- u64 total = 0;
- struct btrfs_device *device;
-
- do {
- enqueued = 0;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (atomic_read(&device->reada_in_flight) <
- MAX_IN_FLIGHT)
- enqueued += reada_start_machine_dev(device);
- }
- total += enqueued;
- } while (enqueued && total < 10000);
-
- return total;
-}
-
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
- int i;
- u64 enqueued = 0;
-
- mutex_lock(&fs_devices->device_list_mutex);
-
- enqueued += reada_start_for_fsdevs(fs_devices);
- list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
- enqueued += reada_start_for_fsdevs(seed_devs);
-
- mutex_unlock(&fs_devices->device_list_mutex);
- if (enqueued == 0)
- return;
-
- /*
- * If everything is already in the cache, this is effectively single
- * threaded. To a) not hold the caller for too long and b) to utilize
- * more cores, we broke the loop above after 10000 iterations and now
- * enqueue to workers to finish it. This will distribute the load to
- * the cores.
- */
- for (i = 0; i < 2; ++i) {
- reada_start_machine(fs_info);
- if (atomic_read(&fs_info->reada_works_cnt) >
- BTRFS_MAX_MIRRORS * 2)
- break;
- }
-}
-
-static void reada_start_machine(struct btrfs_fs_info *fs_info)
-{
- struct reada_machine_work *rmw;
-
- rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
- if (!rmw) {
- /* FIXME we cannot handle this properly right now */
- BUG();
- }
- btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
- rmw->fs_info = fs_info;
-
- btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
- atomic_inc(&fs_info->reada_works_cnt);
-}
-
-#ifdef DEBUG
-static void dump_devs(struct btrfs_fs_info *fs_info, int all)
-{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- unsigned long index;
- int ret;
- int i;
- int j;
- int cnt;
-
- spin_lock(&fs_info->reada_lock);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
- atomic_read(&device->reada_in_flight));
- index = 0;
- while (1) {
- struct reada_zone *zone;
- ret = radix_tree_gang_lookup(&device->reada_zones,
- (void **)&zone, index, 1);
- if (ret == 0)
- break;
- pr_debug(" zone %llu-%llu elems %llu locked %d devs",
- zone->start, zone->end, zone->elems,
- zone->locked);
- for (j = 0; j < zone->ndevs; ++j) {
- pr_cont(" %lld",
- zone->devs[j]->devid);
- }
- if (device->reada_curr_zone == zone)
- pr_cont(" curr off %llu",
- device->reada_next - zone->start);
- pr_cont("\n");
- index = (zone->end >> fs_info->sectorsize_bits) + 1;
- }
- cnt = 0;
- index = 0;
- while (all) {
- struct reada_extent *re = NULL;
-
- ret = radix_tree_gang_lookup(&device->reada_extents,
- (void **)&re, index, 1);
- if (ret == 0)
- break;
- pr_debug(" re: logical %llu size %u empty %d scheduled %d",
- re->logical, fs_info->nodesize,
- list_empty(&re->extctl), re->scheduled);
-
- for (i = 0; i < re->nzones; ++i) {
- pr_cont(" zone %llu-%llu devs",
- re->zones[i]->start,
- re->zones[i]->end);
- for (j = 0; j < re->zones[i]->ndevs; ++j) {
- pr_cont(" %lld",
- re->zones[i]->devs[j]->devid);
- }
- }
- pr_cont("\n");
- index = (re->logical >> fs_info->sectorsize_bits) + 1;
- if (++cnt > 15)
- break;
- }
- }
-
- index = 0;
- cnt = 0;
- while (all) {
- struct reada_extent *re = NULL;
-
- ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
- index, 1);
- if (ret == 0)
- break;
- if (!re->scheduled) {
- index = (re->logical >> fs_info->sectorsize_bits) + 1;
- continue;
- }
- pr_debug("re: logical %llu size %u list empty %d scheduled %d",
- re->logical, fs_info->nodesize,
- list_empty(&re->extctl), re->scheduled);
- for (i = 0; i < re->nzones; ++i) {
- pr_cont(" zone %llu-%llu devs",
- re->zones[i]->start,
- re->zones[i]->end);
- for (j = 0; j < re->zones[i]->ndevs; ++j) {
- pr_cont(" %lld",
- re->zones[i]->devs[j]->devid);
- }
- }
- pr_cont("\n");
- index = (re->logical >> fs_info->sectorsize_bits) + 1;
- }
- spin_unlock(&fs_info->reada_lock);
-}
-#endif
-
-/*
- * interface
- */
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
- struct btrfs_key *key_start, struct btrfs_key *key_end)
-{
- struct reada_control *rc;
- u64 start;
- u64 generation;
- int ret;
- int level;
- struct extent_buffer *node;
- static struct btrfs_key max_key = {
- .objectid = (u64)-1,
- .type = (u8)-1,
- .offset = (u64)-1
- };
-
- rc = kzalloc(sizeof(*rc), GFP_KERNEL);
- if (!rc)
- return ERR_PTR(-ENOMEM);
-
- rc->fs_info = root->fs_info;
- rc->key_start = *key_start;
- rc->key_end = *key_end;
- atomic_set(&rc->elems, 0);
- init_waitqueue_head(&rc->wait);
- kref_init(&rc->refcnt);
- kref_get(&rc->refcnt); /* one ref for having elements */
-
- node = btrfs_root_node(root);
- start = node->start;
- generation = btrfs_header_generation(node);
- level = btrfs_header_level(node);
- free_extent_buffer(node);
-
- ret = reada_add_block(rc, start, &max_key, root->root_key.objectid,
- generation, level);
- if (ret) {
- kfree(rc);
- return ERR_PTR(ret);
- }
-
- reada_start_machine(root->fs_info);
-
- return rc;
-}
-
-#ifdef DEBUG
-int btrfs_reada_wait(void *handle)
-{
- struct reada_control *rc = handle;
- struct btrfs_fs_info *fs_info = rc->fs_info;
-
- while (atomic_read(&rc->elems)) {
- if (!atomic_read(&fs_info->reada_works_cnt))
- reada_start_machine(fs_info);
- wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
- 5 * HZ);
- dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
- }
-
- dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
-
- kref_put(&rc->refcnt, reada_control_release);
-
- return 0;
-}
-#else
-int btrfs_reada_wait(void *handle)
-{
- struct reada_control *rc = handle;
- struct btrfs_fs_info *fs_info = rc->fs_info;
-
- while (atomic_read(&rc->elems)) {
- if (!atomic_read(&fs_info->reada_works_cnt))
- reada_start_machine(fs_info);
- wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
- (HZ + 9) / 10);
- }
-
- kref_put(&rc->refcnt, reada_control_release);
-
- return 0;
-}
-#endif
-
-void btrfs_reada_detach(void *handle)
-{
- struct reada_control *rc = handle;
-
- kref_put(&rc->refcnt, reada_control_release);
-}
-
-/*
- * Before removing a device (device replace or device remove ioctls), call this
- * function to wait for all existing readahead requests on the device and to
- * make sure no one queues more readahead requests for the device.
- *
- * Must be called without holding neither the device list mutex nor the device
- * replace semaphore, otherwise it will deadlock.
- */
-void btrfs_reada_remove_dev(struct btrfs_device *dev)
-{
- struct btrfs_fs_info *fs_info = dev->fs_info;
-
- /* Serialize with readahead extent creation at reada_find_extent(). */
- spin_lock(&fs_info->reada_lock);
- set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
- spin_unlock(&fs_info->reada_lock);
-
- /*
- * There might be readahead requests added to the radix trees which
- * were not yet added to the readahead work queue. We need to start
- * them and wait for their completion, otherwise we can end up with
- * use-after-free problems when dropping the last reference on the
- * readahead extents and their zones, as they need to access the
- * device structure.
- */
- reada_start_machine(fs_info);
- btrfs_flush_workqueue(fs_info->readahead_workers);
-}
-
-/*
- * If when removing a device (device replace or device remove ioctls) an error
- * happens after calling btrfs_reada_remove_dev(), call this to undo what that
- * function did. This is safe to call even if btrfs_reada_remove_dev() was not
- * called before.
- */
-void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
-{
- spin_lock(&dev->fs_info->reada_lock);
- clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
- spin_unlock(&dev->fs_info->reada_lock);
-}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index d2062d5f71dd..a248f46cfe72 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -435,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
struct extent_buffer *leaf = path->nodes[0];
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
unsigned long end, ptr;
u64 offset, flags, count;
int type, ret;
@@ -678,10 +678,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.root;
+ ref_root = generic_ref->tree_ref.owning_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.ref_root;
+ ref_root = generic_ref->data_ref.owning_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
}
@@ -972,6 +972,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
/* Walk down all roots and build the ref tree, meant to be called at mount */
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *eb;
int tree_block_level = 0;
@@ -985,7 +986,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- eb = btrfs_read_lock_root_node(fs_info->extent_root);
+ extent_root = btrfs_extent_root(fs_info, 0);
+ eb = btrfs_read_lock_root_node(extent_root);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
@@ -998,7 +1000,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
* would have had to added a ref key item which may appear on a
* different leaf from the original extent item.
*/
- ret = walk_down_tree(fs_info->extent_root, path, level,
+ ret = walk_down_tree(extent_root, path, level,
&bytenr, &num_bytes, &tree_block_level);
if (ret)
break;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 9b0814318e72..998e3f180d90 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -138,7 +138,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
}
btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
if (page) {
@@ -277,7 +277,7 @@ copy_inline_extent:
path->slots[0]),
size);
btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(dst));
ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
out:
if (!ret && !trans) {
@@ -439,7 +439,7 @@ process_slot:
break;
}
next_key_min_offset = key.offset + datal;
- size = btrfs_item_size_nr(leaf, slot);
+ size = btrfs_item_size(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
@@ -494,7 +494,8 @@ process_slot:
&clone_info, &trans);
if (ret)
goto out;
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ } else {
+ ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
/*
* Inline extents always have to start at file offset 0
* and can never be bigger then the sector size. We can
@@ -505,8 +506,12 @@ process_slot:
*/
ASSERT(key.offset == 0);
ASSERT(datal <= fs_info->sectorsize);
- if (key.offset != 0 || datal > fs_info->sectorsize)
- return -EUCLEAN;
+ if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
+ WARN_ON(key.offset != 0) ||
+ WARN_ON(datal > fs_info->sectorsize)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = clone_copy_inline_extent(inode, path, &new_key,
drop_start, datal, size,
@@ -518,17 +523,22 @@ process_slot:
btrfs_release_path(path);
/*
- * If this is a new extent update the last_reflink_trans of both
- * inodes. This is used by fsync to make sure it does not log
- * multiple checksum items with overlapping ranges. For older
- * extents we don't need to do it since inode logging skips the
- * checksums for older extents. Also ignore holes and inline
- * extents because they don't have checksums in the csum tree.
+ * Whenever we share an extent we update the last_reflink_trans
+ * of each inode to the current transaction. This is needed to
+ * make sure fsync does not log multiple checksum items with
+ * overlapping ranges (because some extent items might refer
+ * only to sections of the original extent). For the destination
+ * inode we do this regardless of the generation of the extents
+ * or even if they are inline extents or explicit holes, to make
+ * sure a full fsync does not skip them. For the source inode,
+ * we only need to update last_reflink_trans in case it's a new
+ * extent that is not a hole or an inline extent, to deal with
+ * the checksums problem on fsync.
*/
- if (extent_gen == trans->transid && disko > 0) {
+ if (extent_gen == trans->transid && disko > 0)
BTRFS_I(src)->last_reflink_trans = trans->transid;
- BTRFS_I(inode)->last_reflink_trans = trans->transid;
- }
+
+ BTRFS_I(inode)->last_reflink_trans = trans->transid;
last_dest_end = ALIGN(new_key.offset + datal,
fs_info->sectorsize);
@@ -575,8 +585,7 @@ process_slot:
* replaced file extent items.
*/
if (last_dest_end >= i_size_read(inode))
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
last_dest_end, destoff + len - 1, NULL, &trans);
@@ -636,7 +645,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
int ret;
/*
- * Lock destination range to serialize with concurrent readpages() and
+ * Lock destination range to serialize with concurrent readahead() and
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
@@ -649,7 +658,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
- int ret;
+ int ret = 0;
u64 i, tail_len, chunk_count;
struct btrfs_root *root_dst = BTRFS_I(dst)->root;
@@ -730,7 +739,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
}
/*
- * Lock destination range to serialize with concurrent readpages() and
+ * Lock destination range to serialize with concurrent readahead() and
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, off, inode, destoff, len);
@@ -772,9 +781,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (btrfs_root_readonly(root_out))
return -EROFS;
- if (file_in->f_path.mnt != file_out->f_path.mnt ||
- inode_in->i_sb != inode_out->i_sb)
- return -EXDEV;
+ ASSERT(inode_in->i_sb == inode_out->i_sb);
}
/* Don't make the dst file partly checksummed */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 914d403b4415..fdc2c4b411f0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -25,6 +25,8 @@
#include "backref.h"
#include "misc.h"
#include "subpage.h"
+#include "zoned.h"
+#include "inode-item.h"
/*
* Relocation overview
@@ -1145,9 +1147,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1156,9 +1158,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1367,8 +1369,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
blocksize, path->nodes[level]->start);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1376,8 +1378,8 @@ again:
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
+ true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1386,8 +1388,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1396,8 +1398,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1735,7 +1737,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
memset(&next_key, 0, sizeof(next_key));
while (1) {
- ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+ min_reserved,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret)
goto out;
@@ -1854,7 +1857,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
- ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
if (ret)
err = ret;
@@ -2322,8 +2325,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
* If we get an enospc just kick back -EAGAIN so we know to drop the
* transaction and try to refill when we can flush all the things.
*/
- ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
- BTRFS_RESERVE_FLUSH_LIMIT);
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
while (tmp <= rc->reserved_bytes)
@@ -2473,9 +2476,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
upper->eb->start);
- ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&ref, node->level,
- btrfs_header_owner(upper->eb));
+ btrfs_header_owner(upper->eb),
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
@@ -2596,9 +2599,9 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
eb = read_tree_block(fs_info, block->bytenr, block->owner,
block->key.offset, block->level, NULL);
- if (IS_ERR(eb)) {
+ if (IS_ERR(eb))
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2691,8 +2694,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
}
@@ -2852,31 +2859,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
return ret;
- /*
- * On a zoned filesystem, we cannot preallocate the file region.
- * Instead, we dirty and fiemap_write the region.
- */
- if (btrfs_is_zoned(inode->root->fs_info)) {
- struct btrfs_root *root = inode->root;
- struct btrfs_trans_handle *trans;
-
- end = cluster->end - offset + 1;
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- i_size_write(&inode->vfs_inode, end);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
- }
-
- return btrfs_end_transaction(trans);
- }
-
btrfs_inode_lock(&inode->vfs_inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
@@ -2903,9 +2885,8 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return ret;
}
-static noinline_for_stack
-int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
- u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
+ u64 start, u64 end, u64 block_start)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
@@ -3016,7 +2997,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
/* Reserve metadata for this range */
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- clamped_len);
+ clamped_len, clamped_len);
if (ret)
goto release_page;
@@ -3084,7 +3065,6 @@ release_page:
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
@@ -3105,7 +3085,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
file_ra_state_init(ra, inode->i_mapping);
- ret = setup_extent_mapping(inode, cluster->start - offset,
+ ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
goto out;
@@ -3114,8 +3094,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
- if (btrfs_is_zoned(fs_info) && !ret)
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3173,7 +3151,7 @@ static int add_tree_block(struct reloc_control *rc,
u64 owner = 0;
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
item_size >= sizeof(*ei) + sizeof(*bi)) {
@@ -3574,7 +3552,7 @@ int prepare_to_relocate(struct reloc_control *rc)
rc->reserved_bytes = 0;
rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
RELOCATION_RESERVED_NODES;
- ret = btrfs_block_rsv_refill(rc->extent_root,
+ ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
rc->block_rsv, rc->block_rsv->size,
BTRFS_RESERVE_FLUSH_ALL);
if (ret)
@@ -3622,9 +3600,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
while (1) {
rc->reserved_bytes = 0;
- ret = btrfs_block_rsv_refill(rc->extent_root,
- rc->block_rsv, rc->block_rsv->size,
- BTRFS_RESERVE_FLUSH_ALL);
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+ rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
err = ret;
break;
@@ -3770,12 +3748,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
- u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
- if (btrfs_is_zoned(trans->fs_info))
- flags &= ~BTRFS_INODE_PREALLOC;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3790,7 +3764,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
@@ -3885,25 +3860,14 @@ out:
* 0 success
* -EINPROGRESS operation is already in progress, that's probably a bug
* -ECANCELED cancellation request was set before the operation started
- * -EAGAIN can not start because there are ongoing send operations
*/
static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
{
- spin_lock(&fs_info->send_reloc_lock);
- if (fs_info->send_in_progress) {
- btrfs_warn_rl(fs_info,
-"cannot run relocation while send operations are in progress (%d in progress)",
- fs_info->send_in_progress);
- spin_unlock(&fs_info->send_reloc_lock);
- return -EAGAIN;
- }
if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
/* This should not happen */
- spin_unlock(&fs_info->send_reloc_lock);
btrfs_err(fs_info, "reloc already running, cannot start");
return -EINPROGRESS;
}
- spin_unlock(&fs_info->send_reloc_lock);
if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
btrfs_info(fs_info, "chunk relocation canceled on start");
@@ -3925,9 +3889,7 @@ static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
/* Requested after start, clear bit first so any waiters can continue */
if (atomic_read(&fs_info->reloc_cancel_req) > 0)
btrfs_info(fs_info, "chunk relocation canceled during operation");
- spin_lock(&fs_info->send_reloc_lock);
clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
- spin_unlock(&fs_info->send_reloc_lock);
atomic_set(&fs_info->reloc_cancel_req, 0);
}
@@ -3990,7 +3952,7 @@ static const char *stage_to_string(int stage)
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
struct btrfs_block_group *bg;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
struct reloc_control *rc;
struct inode *inode;
struct btrfs_path *path;
@@ -3998,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
int rw = 0;
int err = 0;
+ /*
+ * This only gets set if we had a half-deleted snapshot on mount. We
+ * cannot allow relocation to start while we're still trying to clean up
+ * these pending deletions.
+ */
+ ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
+ if (ret)
+ return ret;
+
+ /* We may have been woken up by close_ctree, so bail if we're closing. */
+ if (btrfs_fs_closing(fs_info))
+ return -EINTR;
+
bg = btrfs_lookup_block_group(fs_info, group_start);
if (!bg)
return -ENOENT;
@@ -4063,6 +4038,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->block_group->start,
rc->block_group->length);
+ ret = btrfs_zone_finish(rc->block_group);
+ WARN_ON(ret && ret != -EAGAIN);
+
while (1) {
int finishes_stage;
@@ -4145,9 +4123,8 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
* this function resumes merging reloc trees with corresponding fs trees.
* this is important for keeping the sharing of tree blocks
*/
-int btrfs_recover_relocation(struct btrfs_root *root)
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(reloc_roots);
struct btrfs_key key;
struct btrfs_root *fs_root;
@@ -4188,7 +4165,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_tree_root(root, &key);
+ reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
@@ -4238,7 +4215,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out_end;
}
- rc->extent_root = fs_info->extent_root;
+ rc->extent_root = btrfs_extent_root(fs_info, 0);
set_reloc_control(rc);
@@ -4329,6 +4306,7 @@ out:
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *csum_root;
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered;
int ret;
@@ -4340,7 +4318,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
disk_bytenr = file_pos + inode->index_cnt;
- ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
+ csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+ ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
if (ret)
goto out;
@@ -4386,8 +4365,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
if (!rc)
return 0;
- BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
- root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 702dc5441f03..ca7426ef61c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -25,7 +25,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
u32 len;
int need_reset = 0;
- len = btrfs_item_size_nr(eb, slot);
+ len = btrfs_item_size(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
@@ -39,10 +39,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
need_reset = 1;
}
if (need_reset) {
- memset(&item->generation_v2, 0,
- sizeof(*item) - offsetof(struct btrfs_root_item,
- generation_v2));
-
+ /* Clear all members from generation_v2 onwards. */
+ memset_startat(item, 0, generation_v2);
generate_random_guid(item->uuid);
}
}
@@ -148,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
- old_len = btrfs_item_size_nr(l, slot);
+ old_len = btrfs_item_size(l, slot);
/*
* If this is the first time we update the root item which originated
@@ -280,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
+ struct btrfs_key drop_key;
+
+ btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
+ /*
+ * If we have a non-zero drop_progress then we know we
+ * made it partly through deleting this snapshot, and
+ * thus we need to make sure we block any balance from
+ * happening until this snapshot is completely dropped.
+ */
+ if (drop_key.objectid != 0 || drop_key.type != 0 ||
+ drop_key.offset != 0) {
+ set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+ set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+ }
+
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
@@ -336,7 +349,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto out;
if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -503,7 +517,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
num_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
rsv->space_info = btrfs_find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
- ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 088641ba7a8e..8cd713d37ad2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -39,25 +39,24 @@ struct scrub_block;
struct scrub_ctx;
/*
- * the following three values only influence the performance.
+ * The following three values only influence the performance.
+ *
* The last one configures the number of parallel and outstanding I/O
- * operations. The first two values configure an upper limit for the number
+ * operations. The first one configures an upper limit for the number
* of (dynamically allocated) pages that are added to a bio.
*/
-#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
-#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
-#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
+#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */
/*
- * the following value times PAGE_SIZE needs to be large enough to match the
+ * The following value times PAGE_SIZE needs to be large enough to match the
* largest node/leaf/sector size that shall be supported.
- * Values larger than BTRFS_STRIPE_LEN are not supported.
*/
-#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
+#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
struct scrub_recover {
refcount_t refs;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 map_length;
};
@@ -73,8 +72,8 @@ struct scrub_page {
u64 physical_for_dev_replace;
atomic_t refs;
u8 mirror_num;
- int have_csum:1;
- int io_error:1;
+ unsigned int have_csum:1;
+ unsigned int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
@@ -88,11 +87,7 @@ struct scrub_bio {
blk_status_t status;
u64 logical;
u64 physical;
-#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
- struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
-#else
- struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
-#endif
+ struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
int page_count;
int next_free;
struct btrfs_work work;
@@ -163,7 +158,7 @@ struct scrub_ctx {
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
- int pages_per_rd_bio;
+ int pages_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -174,7 +169,6 @@ struct scrub_ctx {
struct scrub_bio *wr_curr_bio;
struct mutex wr_lock;
- int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
struct btrfs_device *wr_tgtdev;
bool flush_all_writes;
@@ -254,7 +248,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
return spage->recover &&
- (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -578,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+ sctx->pages_per_bio = SCRUB_PAGES_PER_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
@@ -616,7 +610,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sctx->wr_curr_bio = NULL;
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);
- sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
sctx->flush_all_writes = false;
}
@@ -758,7 +751,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
eb = path->nodes[0];
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
@@ -798,7 +791,7 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
{
if (refcount_dec_and_test(&recover->refs)) {
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(recover->bbio);
+ btrfs_put_bioc(recover->bioc);
kfree(recover);
}
}
@@ -852,8 +845,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
have_csum = sblock_to_check->pagev[0]->have_csum;
dev = sblock_to_check->pagev[0]->dev;
- if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
- return btrfs_repair_one_zone(fs_info, logical);
+ if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
+ return 0;
/*
* We must use GFP_NOFS because the scrub task might be waiting for a
@@ -1027,8 +1020,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_other = sblocks_for_recheck + mirror_index;
} else {
struct scrub_recover *r = sblock_bad->pagev[0]->recover;
- int max_allowed = r->bbio->num_stripes -
- r->bbio->num_tgtdevs;
+ int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
@@ -1218,14 +1210,14 @@ out:
return 0;
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
return 2;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
return 3;
else
- return (int)bbio->num_stripes;
+ return (int)bioc->num_stripes;
}
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
@@ -1269,7 +1261,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
u64 flags = original_sblock->pagev[0]->flags;
u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
@@ -1288,7 +1280,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
- bbio = NULL;
+ bioc = NULL;
/*
* With a length of sectorsize, each returned stripe represents
@@ -1296,27 +1288,27 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < sublen) {
- btrfs_put_bbio(bbio);
+ logical, &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < sublen) {
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
refcount_set(&recover->refs, 1);
- recover->bbio = bbio;
+ recover->bioc = bioc;
recover->map_length = mapped_length;
- BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK);
- nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+ nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
@@ -1348,17 +1340,17 @@ leave_nomem:
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
- bbio->map_type,
- bbio->raid_map,
+ bioc->map_type,
+ bioc->raid_map,
mapped_length,
- bbio->num_stripes -
- bbio->num_tgtdevs,
+ bioc->num_stripes -
+ bioc->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
- spage->physical = bbio->stripes[stripe_index].physical +
+ spage->physical = bioc->stripes[stripe_index].physical +
stripe_offset;
- spage->dev = bbio->stripes[stripe_index].dev;
+ spage->dev = bioc->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
spage->physical_for_dev_replace =
@@ -1401,7 +1393,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
bio->bi_end_io = scrub_bio_wait_endio;
mirror_num = spage->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
+ ret = raid56_parity_recover(bio, spage->recover->bioc,
spage->recover->map_length,
mirror_num, 0);
if (ret)
@@ -1423,7 +1415,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
if (!first_page->dev->bdev)
goto out;
- bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio_set_dev(bio, first_page->dev->bdev);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
@@ -1480,7 +1472,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
WARN_ON(!spage->page);
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage->dev->bdev);
bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
@@ -1562,7 +1554,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
}
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage_bad->dev->bdev);
bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
@@ -1676,7 +1668,7 @@ again:
sbio->dev = sctx->wr_tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_bio);
sbio->bio = bio;
}
@@ -1709,7 +1701,7 @@ again:
sbio->pagev[sbio->page_count] = spage;
scrub_page_get(spage);
sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_wr_bio)
+ if (sbio->page_count == sctx->pages_per_bio)
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
@@ -1756,7 +1748,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
struct scrub_ctx *sctx = sbio->sctx;
int i;
- WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+ ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
if (sbio->status) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->fs_info->dev_replace;
@@ -2102,7 +2094,7 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_bio);
sbio->bio = bio;
}
@@ -2136,7 +2128,7 @@ again:
scrub_block_get(sblock); /* one for the page added to the bio */
atomic_inc(&sblock->outstanding_pages);
sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_rd_bio)
+ if (sbio->page_count == sctx->pages_per_bio)
scrub_submit(sctx);
return 0;
@@ -2203,7 +2195,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = sblock->page_count * PAGE_SIZE;
u64 logical = sblock->pagev[0]->logical;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
int ret;
@@ -2211,27 +2203,27 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
if (WARN_ON(!sctx->is_dev_replace ||
- !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
* there's a bug in scrub_remap_extent()/btrfs_map_block().
*/
- goto bbio_out;
+ goto bioc_out;
}
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
+ rbio = raid56_alloc_missing_rbio(bio, bioc, length);
if (!rbio)
goto rbio_out;
@@ -2249,9 +2241,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
@@ -2298,7 +2290,7 @@ leave_nomem:
scrub_block_put(sblock);
return -ENOMEM;
}
- BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
scrub_page_get(spage);
sblock->pagev[index] = spage;
spage->sblock = sblock;
@@ -2370,7 +2362,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
struct scrub_ctx *sctx = sbio->sctx;
int i;
- BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
+ ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
if (sbio->status) {
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
@@ -2632,7 +2624,7 @@ leave_nomem:
scrub_block_put(sblock);
return -ENOMEM;
}
- BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
/* For scrub block */
scrub_page_get(spage);
sblock->pagev[index] = spage;
@@ -2826,7 +2818,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 length;
int ret;
@@ -2838,17 +2830,17 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
- length, sparity->scrub_dev,
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
+ sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
if (!rbio)
@@ -2860,9 +2852,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
@@ -2893,15 +2885,15 @@ static void scrub_parity_put(struct scrub_parity *sparity)
static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *sdev,
- struct btrfs_path *path,
u64 logic_start,
u64 logic_end)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start);
+ struct btrfs_root *csum_root;
struct btrfs_extent_item *extent;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_path *path;
u64 flags;
int ret;
int slot;
@@ -2920,6 +2912,16 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
ASSERT(map->stripe_len <= U32_MAX);
nsectors = map->stripe_len >> fs_info->sectorsize_bits;
bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
@@ -2929,6 +2931,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
+ btrfs_free_path(path);
return -ENOMEM;
}
@@ -3044,23 +3047,24 @@ again:
extent_len);
mapped_length = extent_len;
- bbio = NULL;
+ bioc = NULL;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- extent_logical, &mapped_length, &bbio,
+ extent_logical, &mapped_length, &bioc,
0);
if (!ret) {
- if (!bbio || mapped_length < extent_len)
+ if (!bioc || mapped_length < extent_len)
ret = -EIO;
}
if (ret) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
- extent_physical = bbio->stripes[0].physical;
- extent_mirror_num = bbio->mirror_num;
- extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
+ csum_root = btrfs_csum_root(fs_info, extent_logical);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
extent_logical + extent_len - 1,
@@ -3117,7 +3121,7 @@ out:
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
- btrfs_release_path(path);
+ btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
@@ -3162,17 +3166,18 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
}
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
- int num, u64 base, u64 length,
- struct btrfs_block_group *cache)
+ int stripe_index, u64 dev_extent_len)
{
- struct btrfs_path *path, *ppath;
+ struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_root *root;
+ struct btrfs_root *csum_root;
struct btrfs_extent_item *extent;
struct blk_plug plug;
+ const u64 chunk_logical = bg->start;
u64 flags;
int ret;
int slot;
@@ -3184,11 +3189,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 physical_end;
u64 generation;
int mirror_num;
- struct reada_control *reada1;
- struct reada_control *reada2;
struct btrfs_key key;
- struct btrfs_key key_end;
- u64 increment = map->stripe_len;
+ u64 increment;
u64 offset;
u64 extent_logical;
u64 extent_physical;
@@ -3203,25 +3205,26 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
- physical = map->stripes[num].physical;
+ physical = map->stripes[stripe_index].physical;
offset = 0;
- nstripes = div64_u64(length, map->stripe_len);
+ nstripes = div64_u64(dev_extent_len, map->stripe_len);
mirror_num = 1;
increment = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- offset = map->stripe_len * num;
+ offset = map->stripe_len * stripe_index;
increment = map->stripe_len * map->num_stripes;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
- offset = map->stripe_len * (num / map->sub_stripes);
+ offset = map->stripe_len * (stripe_index / map->sub_stripes);
increment = map->stripe_len * factor;
- mirror_num = num % map->sub_stripes + 1;
+ mirror_num = stripe_index % map->sub_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- mirror_num = num % map->num_stripes + 1;
+ mirror_num = stripe_index % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- mirror_num = num % map->num_stripes + 1;
+ mirror_num = stripe_index % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical, num, map, &offset, NULL);
+ get_raid56_logic_offset(physical, stripe_index, map, &offset,
+ NULL);
increment = map->stripe_len * nr_data_stripes(map);
}
@@ -3229,12 +3232,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (!path)
return -ENOMEM;
- ppath = btrfs_alloc_path();
- if (!ppath) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
-
/*
* work on commit root. The related disk blocks are static as
* long as COW is applied. This means, it is save to rewrite
@@ -3242,20 +3239,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
path->search_commit_root = 1;
path->skip_locking = 1;
+ path->reada = READA_FORWARD;
- ppath->search_commit_root = 1;
- ppath->skip_locking = 1;
- /*
- * trigger the readahead for extent tree csum tree and wait for
- * completion. During readahead, the scrub is officially paused
- * to not hold off transaction commits
- */
- logical = base + offset;
+ logical = chunk_logical + offset;
physical_end = physical + nstripes * map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical_end, num,
+ get_raid56_logic_offset(physical_end, stripe_index,
map, &logic_end, NULL);
- logic_end += base;
+ logic_end += chunk_logical;
} else {
logic_end = logical + increment * nstripes;
}
@@ -3263,32 +3254,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
- /* FIXME it might be better to start readahead at commit root */
- key.objectid = logical;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)0;
- key_end.objectid = logic_end;
- key_end.type = BTRFS_METADATA_ITEM_KEY;
- key_end.offset = (u64)-1;
- reada1 = btrfs_reada_add(root, &key, &key_end);
-
- if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
- key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.type = BTRFS_EXTENT_CSUM_KEY;
- key.offset = logical;
- key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key_end.type = BTRFS_EXTENT_CSUM_KEY;
- key_end.offset = logic_end;
- reada2 = btrfs_reada_add(csum_root, &key, &key_end);
- } else {
- reada2 = NULL;
- }
-
- if (!IS_ERR(reada1))
- btrfs_reada_wait(reada1);
- if (!IS_ERR_OR_NULL(reada2))
- btrfs_reada_wait(reada2);
-
+ root = btrfs_extent_root(fs_info, logical);
+ csum_root = btrfs_csum_root(fs_info, logical);
/*
* collect all data csums for the stripe to avoid seeking during
@@ -3334,16 +3301,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = get_raid56_logic_offset(physical, num, map,
- &logical,
+ ret = get_raid56_logic_offset(physical, stripe_index,
+ map, &logical,
&stripe_logical);
- logical += base;
+ logical += chunk_logical;
if (ret) {
/* it is parity strip */
- stripe_logical += base;
+ stripe_logical += chunk_logical;
stripe_end = stripe_logical + increment;
ret = scrub_raid56_parity(sctx, map, scrub_dev,
- ppath, stripe_logical,
+ stripe_logical,
stripe_end);
if (ret)
goto out;
@@ -3420,13 +3387,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* Continuing would prevent reusing its device extents
* for new block groups for a long time.
*/
- spin_lock(&cache->lock);
- if (cache->removed) {
- spin_unlock(&cache->lock);
+ spin_lock(&bg->lock);
+ if (bg->removed) {
+ spin_unlock(&bg->lock);
ret = 0;
goto out;
}
- spin_unlock(&cache->lock);
+ spin_unlock(&bg->lock);
extent = btrfs_item_ptr(l, slot,
struct btrfs_extent_item);
@@ -3505,16 +3472,16 @@ again:
loop:
physical += map->stripe_len;
ret = get_raid56_logic_offset(physical,
- num, map, &logical,
- &stripe_logical);
- logical += base;
+ stripe_index, map,
+ &logical, &stripe_logical);
+ logical += chunk_logical;
if (ret && physical < physical_end) {
- stripe_logical += base;
+ stripe_logical += chunk_logical;
stripe_end = stripe_logical +
increment;
ret = scrub_raid56_parity(sctx,
- map, scrub_dev, ppath,
+ map, scrub_dev,
stripe_logical,
stripe_end);
if (ret)
@@ -3544,8 +3511,8 @@ skip:
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
if (stop_loop)
- sctx->stat.last_physical = map->stripes[num].physical +
- length;
+ sctx->stat.last_physical = map->stripes[stripe_index].physical +
+ dev_extent_len;
else
sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
@@ -3561,14 +3528,14 @@ out:
blk_finish_plug(&plug);
btrfs_free_path(path);
- btrfs_free_path(ppath);
if (sctx->is_dev_replace && ret >= 0) {
int ret2;
- ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
- map->stripes[num].physical,
- physical_end);
+ ret2 = sync_write_pointer_for_zoned(sctx,
+ chunk_logical + offset,
+ map->stripes[stripe_index].physical,
+ physical_end);
if (ret2)
ret = ret2;
}
@@ -3577,10 +3544,10 @@ out:
}
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg,
struct btrfs_device *scrub_dev,
- u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group *cache)
+ u64 dev_extent_len)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
@@ -3590,7 +3557,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
int ret = 0;
read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, chunk_offset, 1);
+ em = lookup_extent_mapping(map_tree, bg->start, bg->length);
read_unlock(&map_tree->lock);
if (!em) {
@@ -3598,26 +3565,24 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
* Might have been an unused block group deleted by the cleaner
* kthread or relocation.
*/
- spin_lock(&cache->lock);
- if (!cache->removed)
+ spin_lock(&bg->lock);
+ if (!bg->removed)
ret = -EINVAL;
- spin_unlock(&cache->lock);
+ spin_unlock(&bg->lock);
return ret;
}
-
- map = em->map_lookup;
- if (em->start != chunk_offset)
+ if (em->start != bg->start)
goto out;
-
- if (em->len < length)
+ if (em->len < dev_extent_len)
goto out;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, map, scrub_dev, i,
- chunk_offset, length, cache);
+ ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
+ dev_extent_len);
if (ret)
goto out;
}
@@ -3655,7 +3620,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->dev_root;
- u64 length;
u64 chunk_offset;
int ret = 0;
int ro_set;
@@ -3679,6 +3643,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
key.type = BTRFS_DEV_EXTENT_KEY;
while (1) {
+ u64 dev_extent_len;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
break;
@@ -3715,9 +3681,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
- length = btrfs_dev_extent_length(l, dev_extent);
+ dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
- if (found_key.offset + length <= start)
+ if (found_key.offset + dev_extent_len <= start)
goto skip;
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
@@ -3733,6 +3699,31 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ ASSERT(cache->start <= chunk_offset);
+ /*
+ * We are using the commit root to search for device extents, so
+ * that means we could have found a device extent item from a
+ * block group that was deleted in the current transaction. The
+ * logical start offset of the deleted block group, stored at
+ * @chunk_offset, might be part of the logical address range of
+ * a new block group (which uses different physical extents).
+ * In this case btrfs_lookup_block_group() has returned the new
+ * block group, and its start address is less than @chunk_offset.
+ *
+ * We skip such new block groups, because it's pointless to
+ * process them, as we won't find their extents because we search
+ * for them using the commit root of the extent tree. For a device
+ * replace it's also fine to skip it, we won't miss copying them
+ * to the target device because we have the write duplication
+ * setup through the regular write path (by btrfs_map_block()),
+ * and we have committed a transaction when we started the device
+ * replace, right after setting up the device replace state.
+ */
+ if (cache->start < chunk_offset) {
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
spin_lock(&cache->lock);
if (!cache->to_copy) {
@@ -3851,13 +3842,13 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
down_write(&dev_replace->rwsem);
- dev_replace->cursor_right = found_key.offset + length;
+ dev_replace->cursor_right = found_key.offset + dev_extent_len;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
- ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, cache);
+ ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
+ dev_extent_len);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3938,7 +3929,7 @@ skip_unfreeze:
break;
}
skip:
- key.offset = found_key.offset + length;
+ key.offset = found_key.offset + dev_extent_len;
btrfs_release_path(path);
}
@@ -3956,7 +3947,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
int ret;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
@@ -4068,6 +4059,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
@@ -4115,7 +4107,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
goto out_free_ctx;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4288,11 +4280,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct btrfs_device *dev;
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
@@ -4309,20 +4302,20 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
int *extent_mirror_num)
{
u64 mapped_length;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
mapped_length = extent_len;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
- &mapped_length, &bbio, 0);
- if (ret || !bbio || mapped_length < extent_len ||
- !bbio->stripes[0].dev->bdev) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc, 0);
+ if (ret || !bioc || mapped_length < extent_len ||
+ !bioc->stripes[0].dev->bdev) {
+ btrfs_put_bioc(bioc);
return;
}
- *extent_physical = bbio->stripes[0].physical;
- *extent_mirror_num = bbio->mirror_num;
- *extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ *extent_physical = bioc->stripes[0].physical;
+ *extent_mirror_num = bioc->mirror_num;
+ *extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 72f9b865e847..7d1642937274 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -24,6 +24,7 @@
#include "transaction.h"
#include "compression.h"
#include "xattr.h"
+#include "print-tree.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -84,6 +85,8 @@ struct send_ctx {
u64 total_send_size;
u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
+ /* Protocol version compatibility requested */
+ u32 proto;
struct btrfs_root *send_root;
struct btrfs_root *parent_root;
@@ -96,6 +99,15 @@ struct send_ctx {
struct btrfs_key *cmp_key;
/*
+ * Keep track of the generation of the last transaction that was used
+ * for relocating a block group. This is periodically checked in order
+ * to detect if a relocation happened since the last check, so that we
+ * don't operate on stale extent buffers for nodes (level >= 1) or on
+ * stale disk_bytenr values of file extent items.
+ */
+ u64 last_reloc_trans;
+
+ /*
* infos of the currently processed inode. In case of deleted inodes,
* these are the values from the deleted inode.
*/
@@ -312,6 +324,16 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
sctx->parent_root->root_key.objectid : 0));
}
+__maybe_unused
+static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
+{
+ switch (sctx->proto) {
+ case 1: return cmd < __BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd < __BTRFS_SEND_C_MAX_V2;
+ default: return false;
+ }
+}
+
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
static struct waiting_dir_move *
@@ -506,17 +528,12 @@ out:
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
- int ret;
-
p->reversed = from->reversed;
fs_path_reset(p);
- ret = fs_path_add_path(p, from);
-
- return ret;
+ return fs_path_add_path(p, from);
}
-
static void fs_path_unreverse(struct fs_path *p)
{
char *tmp;
@@ -886,7 +903,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
iterate_inode_ref_t iterate, void *ctx)
{
struct extent_buffer *eb = path->nodes[0];
- struct btrfs_item *item;
struct btrfs_inode_ref *iref;
struct btrfs_inode_extref *extref;
struct btrfs_path *tmp_path;
@@ -918,12 +934,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
if (found_key->type == BTRFS_INODE_REF_KEY) {
ptr = (unsigned long)btrfs_item_ptr(eb, slot,
struct btrfs_inode_ref);
- item = btrfs_item_nr(slot);
- total = btrfs_item_size(eb, item);
+ total = btrfs_item_size(eb, slot);
elem_size = sizeof(*iref);
} else {
ptr = btrfs_item_ptr_offset(eb, slot);
- total = btrfs_item_size_nr(eb, slot);
+ total = btrfs_item_size(eb, slot);
elem_size = sizeof(*extref);
}
@@ -992,7 +1007,7 @@ out:
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len,
- u8 type, void *ctx);
+ void *ctx);
/*
* Helper function to iterate the entries in ONE btrfs_dir_item.
@@ -1006,7 +1021,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
{
int ret = 0;
struct extent_buffer *eb;
- struct btrfs_item *item;
struct btrfs_dir_item *di;
struct btrfs_key di_key;
char *buf = NULL;
@@ -1018,7 +1032,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
u32 total;
int slot;
int num;
- u8 type;
/*
* Start with a small buffer (1 page). If later we end up needing more
@@ -1035,20 +1048,18 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
eb = path->nodes[0];
slot = path->slots[0];
- item = btrfs_item_nr(slot);
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
cur = 0;
len = 0;
- total = btrfs_item_size(eb, item);
+ total = btrfs_item_size(eb, slot);
num = 0;
while (cur < total) {
name_len = btrfs_dir_name_len(eb, di);
data_len = btrfs_dir_data_len(eb, di);
- type = btrfs_dir_type(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- if (type == BTRFS_FT_XATTR) {
+ if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
if (name_len > XATTR_NAME_MAX) {
ret = -ENAMETOOLONG;
goto out;
@@ -1098,7 +1109,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
cur += len;
ret = iterate(num, &di_key, buf, name_len, buf + name_len,
- data_len, type, ctx);
+ data_len, ctx);
if (ret < 0)
goto out;
if (ret) {
@@ -1415,6 +1426,26 @@ static int find_extent_clone(struct send_ctx *sctx,
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ /*
+ * A transaction commit for a transaction in which block group
+ * relocation was done just happened.
+ * The disk_bytenr of the file extent item we processed is
+ * possibly stale, referring to the extent's location before
+ * relocation. So act as if we haven't found any clone sources
+ * and fall back to write commands, which will read the correct
+ * data from the new extent location. Otherwise we will fail
+ * below because we haven't found our own back reference or we
+ * could be getting incorrect sources in case the old extent
+ * was already reallocated after the relocation.
+ */
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOENT;
+ goto out;
+ }
+ up_read(&fs_info->commit_root_sem);
+
if (!backref_ctx.found_itself) {
/* found a bug in backref code? */
ret = -EIO;
@@ -1680,8 +1711,7 @@ out:
*/
static int lookup_dir_item_inode(struct btrfs_root *root,
u64 dir, const char *name, int name_len,
- u64 *found_inode,
- u8 *found_type)
+ u64 *found_inode)
{
int ret = 0;
struct btrfs_dir_item *di;
@@ -1704,7 +1734,6 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
goto out;
}
*found_inode = key.objectid;
- *found_type = btrfs_dir_type(path->nodes[0], di);
out:
btrfs_free_path(path);
@@ -1827,7 +1856,6 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
int ret = 0;
u64 gen;
u64 other_inode = 0;
- u8 other_type = 0;
if (!sctx->parent_root)
goto out;
@@ -1855,7 +1883,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
}
ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
- &other_inode, &other_type);
+ &other_inode);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1900,7 +1928,6 @@ static int did_overwrite_ref(struct send_ctx *sctx,
int ret = 0;
u64 gen;
u64 ow_inode;
- u8 other_type;
if (!sctx->parent_root)
goto out;
@@ -1924,7 +1951,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
/* check if the ref was overwritten by another ref */
ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
- &ow_inode, &other_type);
+ &ow_inode);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -2720,19 +2747,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx)
if (S_ISDIR(sctx->cur_inode_mode)) {
ret = did_create_dir(sctx, sctx->cur_ino);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ else if (ret > 0)
+ return 0;
}
- ret = send_create_inode(sctx, sctx->cur_ino);
- if (ret < 0)
- goto out;
-
-out:
- return ret;
+ return send_create_inode(sctx, sctx->cur_ino);
}
struct recorded_ref {
@@ -3617,7 +3637,7 @@ static int is_ancestor(struct btrfs_root *root,
key.type != BTRFS_INODE_EXTREF_KEY)
break;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
while (cur_offset < item_size) {
u64 parent;
u64 parent_gen;
@@ -4646,9 +4666,8 @@ out:
}
static int __process_new_xattr(int num, struct btrfs_key *di_key,
- const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *ctx)
+ const char *name, int name_len, const char *data,
+ int data_len, void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4692,8 +4711,7 @@ out:
static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *ctx)
+ const char *data, int data_len, void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4738,10 +4756,8 @@ struct find_xattr_ctx {
int found_data_len;
};
-static int __find_xattr(int num, struct btrfs_key *di_key,
- const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *vctx)
+static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
+ int name_len, const char *data, int data_len, void *vctx)
{
struct find_xattr_ctx *ctx = vctx;
@@ -4791,7 +4807,7 @@ static int find_xattr(struct btrfs_root *root,
static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len,
- u8 type, void *ctx)
+ void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4803,12 +4819,12 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
&found_data_len);
if (ret == -ENOENT) {
ret = __process_new_xattr(num, di_key, name, name_len, data,
- data_len, type, ctx);
+ data_len, ctx);
} else if (ret >= 0) {
if (data_len != found_data_len ||
memcmp(data, found_data, data_len)) {
ret = __process_new_xattr(num, di_key, name, name_len,
- data, data_len, type, ctx);
+ data, data_len, ctx);
} else {
ret = 0;
}
@@ -4821,7 +4837,7 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len,
- u8 type, void *ctx)
+ void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4830,7 +4846,7 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
name, name_len, NULL, NULL);
if (ret == -ENOENT)
ret = __process_deleted_xattr(num, di_key, name, name_len, data,
- data_len, type, ctx);
+ data_len, ctx);
else if (ret >= 0)
ret = 0;
@@ -4978,6 +4994,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
+ btrfs_err(fs_info,
+ "send: IO error at offset %llu for inode %llu root %llu",
+ page_offset(page), sctx->cur_ino,
+ sctx->send_root->root_key.objectid);
put_page(page);
ret = -EIO;
break;
@@ -6561,7 +6581,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *)(ptr +
@@ -6592,6 +6612,50 @@ static int changed_cb(struct btrfs_path *left_path,
{
int ret = 0;
+ /*
+ * We can not hold the commit root semaphore here. This is because
+ * holding it, in the case of sending and receiving to the same
+ * filesystem using a pipe, could result in a deadlock:
+ *
+ * 1) The task running send blocks on the pipe because it's full;
+ *
+ * 2) The task running receive, which is the only consumer of the pipe,
+ * is waiting for a transaction commit (for example due to a space
+ * reservation when doing a write or triggering a transaction commit
+ * when creating a subvolume);
+ *
+ * 3) The transaction is waiting to write lock the commit root semaphore,
+ * but can not acquire it since it's being held at 1).
+ *
+ * Down this call chain we write to the pipe through kernel_write().
+ * The same type of problem can also happen when sending to a file that
+ * is stored in the same filesystem - when reserving space for a write
+ * into the file, we can trigger a transaction commit.
+ *
+ * Our caller has supplied us with clones of leaves from the send and
+ * parent roots, so we're safe here from a concurrent relocation and
+ * further reallocation of metadata extents while we are here. Below we
+ * also assert that the leaves are clones.
+ */
+ lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
+
+ /*
+ * We always have a send root, so left_path is never NULL. We will not
+ * have a leaf when we have reached the end of the send root but have
+ * not yet reached the end of the parent root.
+ */
+ if (left_path->nodes[0])
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+ &left_path->nodes[0]->bflags));
+ /*
+ * When doing a full send we don't have a parent root, so right_path is
+ * NULL. When doing an incremental send, we may have reached the end of
+ * the parent root already, so we don't have a leaf at right_path.
+ */
+ if (right_path && right_path->nodes[0])
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+ &right_path->nodes[0]->bflags));
+
if (result == BTRFS_COMPARE_TREE_SAME) {
if (key->type == BTRFS_INODE_REF_KEY ||
key->type == BTRFS_INODE_EXTREF_KEY) {
@@ -6638,14 +6702,46 @@ out:
return ret;
}
+static int search_key_again(const struct send_ctx *sctx,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key)
+{
+ int ret;
+
+ if (!path->need_commit_sem)
+ lockdep_assert_held_read(&root->fs_info->commit_root_sem);
+
+ /*
+ * Roots used for send operations are readonly and no one can add,
+ * update or remove keys from them, so we should be able to find our
+ * key again. The only exception is deduplication, which can operate on
+ * readonly roots and add, update or remove keys to/from them - but at
+ * the moment we don't allow it to run in parallel with send.
+ */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ ASSERT(ret <= 0);
+ if (ret > 0) {
+ btrfs_print_tree(path->nodes[path->lowest_level], false);
+ btrfs_err(root->fs_info,
+"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
+ key->objectid, key->type, key->offset,
+ (root == sctx->parent_root ? "parent" : "send"),
+ root->root_key.objectid, path->lowest_level,
+ path->slots[path->lowest_level]);
+ return -EUCLEAN;
+ }
+
+ return ret;
+}
+
static int full_send_tree(struct send_ctx *sctx)
{
int ret;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
+ struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_path *path;
- struct extent_buffer *eb;
- int slot;
path = alloc_path_for_send();
if (!path)
@@ -6656,6 +6752,10 @@ static int full_send_tree(struct send_ctx *sctx)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
+ down_read(&fs_info->commit_root_sem);
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ up_read(&fs_info->commit_root_sem);
+
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
goto out;
@@ -6663,15 +6763,35 @@ static int full_send_tree(struct send_ctx *sctx)
goto out_finish;
while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &key, slot);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ret = changed_cb(path, NULL, &key,
BTRFS_COMPARE_TREE_NEW, sctx);
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ up_read(&fs_info->commit_root_sem);
+ /*
+ * A transaction used for relocating a block group was
+ * committed or is about to finish its commit. Release
+ * our path (leaf) and restart the search, so that we
+ * avoid operating on any file extent items that are
+ * stale, with a disk_bytenr that reflects a pre
+ * relocation value. This way we avoid, as much as
+ * possible, falling back to regular writes when checking
+ * if we can clone file ranges.
+ */
+ btrfs_release_path(path);
+ ret = search_key_again(sctx, send_root, path, &key);
+ if (ret < 0)
+ goto out;
+ } else {
+ up_read(&fs_info->commit_root_sem);
+ }
+
ret = btrfs_next_item(send_root, path);
if (ret < 0)
goto out;
@@ -6689,6 +6809,20 @@ out:
return ret;
}
+static int replace_node_with_clone(struct btrfs_path *path, int level)
+{
+ struct extent_buffer *clone;
+
+ clone = btrfs_clone_extent_buffer(path->nodes[level]);
+ if (!clone)
+ return -ENOMEM;
+
+ free_extent_buffer(path->nodes[level]);
+ path->nodes[level] = clone;
+
+ return 0;
+}
+
static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
struct extent_buffer *eb;
@@ -6698,6 +6832,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
u64 reada_max;
u64 reada_done = 0;
+ lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
+
BUG_ON(*level == 0);
eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb))
@@ -6721,6 +6857,10 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
path->nodes[*level - 1] = eb;
path->slots[*level - 1] = 0;
(*level)--;
+
+ if (*level == 0)
+ return replace_node_with_clone(path, 0);
+
return 0;
}
@@ -6734,8 +6874,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
path->slots[*level]++;
while (path->slots[*level] >= nritems) {
- if (*level == root_level)
+ if (*level == root_level) {
+ path->slots[*level] = nritems - 1;
return -1;
+ }
/* move upnext */
path->slots[*level] = 0;
@@ -6767,14 +6909,20 @@ static int tree_advance(struct btrfs_path *path,
} else {
ret = tree_move_down(path, level, reada_min_gen);
}
- if (ret >= 0) {
- if (*level == 0)
- btrfs_item_key_to_cpu(path->nodes[*level], key,
- path->slots[*level]);
- else
- btrfs_node_key_to_cpu(path->nodes[*level], key,
- path->slots[*level]);
- }
+
+ /*
+ * Even if we have reached the end of a tree (ret is -1), update the key
+ * anyway, so that in case we need to restart due to a block group
+ * relocation, we can assert that the last key of the root node still
+ * exists in the tree.
+ */
+ if (*level == 0)
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ else
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+
return ret;
}
@@ -6786,8 +6934,8 @@ static int tree_compare_item(struct btrfs_path *left_path,
int len1, len2;
unsigned long off1, off2;
- len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
- len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+ len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
+ len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
if (len1 != len2)
return 1;
@@ -6804,6 +6952,97 @@ static int tree_compare_item(struct btrfs_path *left_path,
}
/*
+ * A transaction used for relocating a block group was committed or is about to
+ * finish its commit. Release our paths and restart the search, so that we are
+ * not using stale extent buffers:
+ *
+ * 1) For levels > 0, we are only holding references of extent buffers, without
+ * any locks on them, which does not prevent them from having been relocated
+ * and reallocated after the last time we released the commit root semaphore.
+ * The exceptions are the root nodes, for which we always have a clone, see
+ * the comment at btrfs_compare_trees();
+ *
+ * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
+ * we are safe from the concurrent relocation and reallocation. However they
+ * can have file extent items with a pre relocation disk_bytenr value, so we
+ * restart the search from the current commit roots and clone the new leaves so
+ * that we get the post relocation disk_bytenr values. Not doing so could
+ * make us clone the wrong data in case there are new extents using the old
+ * disk_bytenr that happen to be shared.
+ */
+static int restart_after_relocation(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ const struct btrfs_key *left_key,
+ const struct btrfs_key *right_key,
+ int left_level,
+ int right_level,
+ const struct send_ctx *sctx)
+{
+ int root_level;
+ int ret;
+
+ lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
+
+ btrfs_release_path(left_path);
+ btrfs_release_path(right_path);
+
+ /*
+ * Since keys can not be added or removed to/from our roots because they
+ * are readonly and we do not allow deduplication to run in parallel
+ * (which can add, remove or change keys), the layout of the trees should
+ * not change.
+ */
+ left_path->lowest_level = left_level;
+ ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
+ if (ret < 0)
+ return ret;
+
+ right_path->lowest_level = right_level;
+ ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If the lowest level nodes are leaves, clone them so that they can be
+ * safely used by changed_cb() while not under the protection of the
+ * commit root semaphore, even if relocation and reallocation happens in
+ * parallel.
+ */
+ if (left_level == 0) {
+ ret = replace_node_with_clone(left_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (right_level == 0) {
+ ret = replace_node_with_clone(right_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * Now clone the root nodes (unless they happen to be the leaves we have
+ * already cloned). This is to protect against concurrent snapshotting of
+ * the send and parent roots (see the comment at btrfs_compare_trees()).
+ */
+ root_level = btrfs_header_level(sctx->send_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(left_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ root_level = btrfs_header_level(sctx->parent_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(right_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
* This function compares two trees and calls the provided callback for
* every changed/new/deleted item it finds.
* If shared tree blocks are encountered, whole subtrees are skipped, making
@@ -6831,10 +7070,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
int right_root_level;
int left_level;
int right_level;
- int left_end_reached;
- int right_end_reached;
- int advance_left;
- int advance_right;
+ int left_end_reached = 0;
+ int right_end_reached = 0;
+ int advance_left = 0;
+ int advance_right = 0;
u64 left_blockptr;
u64 right_blockptr;
u64 left_gen;
@@ -6902,12 +7141,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
down_read(&fs_info->commit_root_sem);
left_level = btrfs_header_level(left_root->commit_root);
left_root_level = left_level;
+ /*
+ * We clone the root node of the send and parent roots to prevent races
+ * with snapshot creation of these roots. Snapshot creation COWs the
+ * root node of a tree, so after the transaction is committed the old
+ * extent can be reallocated while this send operation is still ongoing.
+ * So we clone them, under the commit root semaphore, to be race free.
+ */
left_path->nodes[left_level] =
btrfs_clone_extent_buffer(left_root->commit_root);
if (!left_path->nodes[left_level]) {
- up_read(&fs_info->commit_root_sem);
ret = -ENOMEM;
- goto out;
+ goto out_unlock;
}
right_level = btrfs_header_level(right_root->commit_root);
@@ -6915,9 +7160,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
right_path->nodes[right_level] =
btrfs_clone_extent_buffer(right_root->commit_root);
if (!right_path->nodes[right_level]) {
- up_read(&fs_info->commit_root_sem);
ret = -ENOMEM;
- goto out;
+ goto out_unlock;
}
/*
* Our right root is the parent root, while the left root is the "send"
@@ -6927,7 +7171,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
* will need to read them at some point.
*/
reada_min_gen = btrfs_header_generation(right_root->commit_root);
- up_read(&fs_info->commit_root_sem);
if (left_level == 0)
btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -6942,11 +7185,26 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
btrfs_node_key_to_cpu(right_path->nodes[right_level],
&right_key, right_path->slots[right_level]);
- left_end_reached = right_end_reached = 0;
- advance_left = advance_right = 0;
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
while (1) {
- cond_resched();
+ if (need_resched() ||
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
+ up_read(&fs_info->commit_root_sem);
+ cond_resched();
+ down_read(&fs_info->commit_root_sem);
+ }
+
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ ret = restart_after_relocation(left_path, right_path,
+ &left_key, &right_key,
+ left_level, right_level,
+ sctx);
+ if (ret < 0)
+ goto out_unlock;
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ }
+
if (advance_left && !left_end_reached) {
ret = tree_advance(left_path, &left_level,
left_root_level,
@@ -6955,7 +7213,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
if (ret == -1)
left_end_reached = ADVANCE;
else if (ret < 0)
- goto out;
+ goto out_unlock;
advance_left = 0;
}
if (advance_right && !right_end_reached) {
@@ -6966,54 +7224,55 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
if (ret == -1)
right_end_reached = ADVANCE;
else if (ret < 0)
- goto out;
+ goto out_unlock;
advance_right = 0;
}
if (left_end_reached && right_end_reached) {
ret = 0;
- goto out;
+ goto out_unlock;
} else if (left_end_reached) {
if (right_level == 0) {
+ up_read(&fs_info->commit_root_sem);
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
sctx);
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
}
advance_right = ADVANCE;
continue;
} else if (right_end_reached) {
if (left_level == 0) {
+ up_read(&fs_info->commit_root_sem);
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
sctx);
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
}
advance_left = ADVANCE;
continue;
}
if (left_level == 0 && right_level == 0) {
+ up_read(&fs_info->commit_root_sem);
cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
if (cmp < 0) {
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
sctx);
- if (ret < 0)
- goto out;
advance_left = ADVANCE;
} else if (cmp > 0) {
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
sctx);
- if (ret < 0)
- goto out;
advance_right = ADVANCE;
} else {
enum btrfs_compare_tree_result result;
@@ -7027,11 +7286,13 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
result = BTRFS_COMPARE_TREE_SAME;
ret = changed_cb(left_path, right_path,
&left_key, result, sctx);
- if (ret < 0)
- goto out;
advance_left = ADVANCE;
advance_right = ADVANCE;
}
+
+ if (ret < 0)
+ goto out;
+ down_read(&fs_info->commit_root_sem);
} else if (left_level == right_level) {
cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
if (cmp < 0) {
@@ -7071,6 +7332,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
}
}
+out_unlock:
+ up_read(&fs_info->commit_root_sem);
out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
@@ -7209,10 +7472,10 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
root->root_key.objectid, root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
+ struct btrfs_root *send_root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
@@ -7276,6 +7539,17 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
sctx->flags = arg->flags;
+ if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
+ if (arg->version > BTRFS_SEND_STREAM_VERSION) {
+ ret = -EPROTO;
+ goto out;
+ }
+ /* Zero means "use the highest version" */
+ sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
+ } else {
+ sctx->proto = 1;
+ }
+
sctx->send_filp = fget(arg->send_fd);
if (!sctx->send_filp) {
ret = -EBADF;
@@ -7409,21 +7683,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (ret)
goto out;
- spin_lock(&fs_info->send_reloc_lock);
- if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
- spin_unlock(&fs_info->send_reloc_lock);
- btrfs_warn_rl(fs_info,
- "cannot run send because a relocation operation is in progress");
- ret = -EAGAIN;
- goto out;
- }
- fs_info->send_in_progress++;
- spin_unlock(&fs_info->send_reloc_lock);
-
ret = send_subvol(sctx);
- spin_lock(&fs_info->send_reloc_lock);
- fs_info->send_in_progress--;
- spin_unlock(&fs_info->send_reloc_lock);
if (ret < 0)
goto out;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index de91488b7cd0..08602fdd600a 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -48,6 +48,7 @@ struct btrfs_tlv_header {
enum btrfs_send_cmd {
BTRFS_SEND_C_UNSPEC,
+ /* Version 1 */
BTRFS_SEND_C_SUBVOL,
BTRFS_SEND_C_SNAPSHOT,
@@ -76,6 +77,12 @@ enum btrfs_send_cmd {
BTRFS_SEND_C_END,
BTRFS_SEND_C_UPDATE_EXTENT,
+ __BTRFS_SEND_C_MAX_V1,
+
+ /* Version 2 */
+ __BTRFS_SEND_C_MAX_V2,
+
+ /* End */
__BTRFS_SEND_C_MAX,
};
#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
@@ -119,7 +126,7 @@ enum {
#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
#ifdef __KERNEL__
-long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
#endif
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index aa5be0b24987..b87931a458eb 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -617,7 +617,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 num_bytes,
enum btrfs_flush_state state, bool for_preempt)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int nr;
int ret = 0;
@@ -737,6 +737,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
u64 thresh = div_factor_fine(space_info->total_bytes, 90);
u64 used;
+ lockdep_assert_held(&space_info->lock);
+
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved +
global_rsv_size) >= thresh)
@@ -844,6 +846,9 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 min_bytes;
+ if (!ticket->steal)
+ return false;
+
if (global_rsv->space_info != space_info)
return false;
@@ -885,6 +890,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
+ const bool aborted = BTRFS_FS_ERROR(fs_info);
trace_btrfs_fail_all_tickets(fs_info, space_info);
@@ -898,16 +904,18 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- if (ticket->steal &&
- steal_from_global_rsv(fs_info, space_info, ticket))
+ if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
return true;
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
remove_ticket(space_info, ticket);
- ticket->error = -ENOSPC;
+ if (aborted)
+ ticket->error = -EIO;
+ else
+ ticket->error = -ENOSPC;
wake_up(&ticket->wait);
/*
@@ -916,7 +924,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
* here to see if we can make progress with the next ticket in
* the list.
*/
- btrfs_try_granting_tickets(fs_info, space_info);
+ if (!aborted)
+ btrfs_try_granting_tickets(fs_info, space_info);
}
return (tickets_id != space_info->tickets_id);
}
@@ -1054,7 +1063,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
trans_rsv->reserved;
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
- spin_unlock(&space_info->lock);
/*
* We don't want to include the global_rsv in our calculation,
@@ -1085,6 +1093,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
flush = FLUSH_DELAYED_REFS_NR;
}
+ spin_unlock(&space_info->lock);
+
/*
* We don't want to reclaim everything, just a portion, so scale
* down the to_reclaim by 1/4. If it takes us down to 0,
@@ -1172,6 +1182,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
spin_unlock(&space_info->lock);
return;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
}
@@ -1202,9 +1216,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
} else {
flush_state = 0;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
+
}
spin_unlock(&space_info->lock);
}
+ return;
+
+aborted_fs:
+ maybe_fail_all_tickets(fs_info, space_info);
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
}
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
@@ -1240,18 +1265,23 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
int states_nr)
{
u64 to_reclaim;
- int flush_state;
+ int flush_state = 0;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
- if (!to_reclaim) {
+ /*
+ * This is the priority reclaim path, so to_reclaim could be >0 still
+ * because we may have only satisfied the priority tickets and still
+ * left non priority tickets on the list. We would then have
+ * to_reclaim but ->bytes == 0.
+ */
+ if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
- flush_state = 0;
- do {
+ while (flush_state < states_nr) {
+ spin_unlock(&space_info->lock);
flush_space(fs_info, space_info, to_reclaim, states[flush_state],
false);
flush_state++;
@@ -1260,23 +1290,49 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
- } while (flush_state < states_nr);
+ }
+
+ /* Attempt to steal from the global rsv if we can. */
+ if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ }
+
+ /*
+ * We must run try_granting_tickets here because we could be a large
+ * ticket in front of a smaller ticket that can now be satisfied with
+ * the available space.
+ */
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
}
static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
+ spin_lock(&space_info->lock);
+
+ /* We could have been granted before we got here. */
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
while (!space_info->full) {
+ spin_unlock(&space_info->lock);
flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
}
+
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
}
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
@@ -1358,25 +1414,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
break;
}
- spin_lock(&space_info->lock);
ret = ticket->error;
- if (ticket->bytes || ticket->error) {
- /*
- * We were a priority ticket, so we need to delete ourselves
- * from the list. Because we could have other priority tickets
- * behind us that require less space, run
- * btrfs_try_granting_tickets() to see if their reservations can
- * now be made.
- */
- if (!list_empty(&ticket->list)) {
- remove_ticket(space_info, ticket);
- btrfs_try_granting_tickets(fs_info, space_info);
- }
-
- if (!ret)
- ret = -ENOSPC;
- }
- spin_unlock(&space_info->lock);
ASSERT(list_empty(&ticket->list));
/*
* Check that we can't have an error set if the reservation succeeded,
@@ -1418,6 +1456,12 @@ static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
space_info->clamp = min(space_info->clamp + 1, 8);
}
+static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_EVICT);
+}
+
/**
* Try to reserve bytes from the block_rsv's space
*
@@ -1491,7 +1535,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
- ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+ ticket.steal = can_steal(flush);
if (trace_btrfs_reserve_ticket_enabled())
start_ns = ktime_get_ns();
@@ -1547,7 +1591,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
/**
* Trye to reserve metadata bytes from the block_rsv's space
*
- * @root: the root we're allocating for
+ * @fs_info: the filesystem
* @block_rsv: block_rsv we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
@@ -1559,22 +1603,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
- if (ret == -ENOSPC &&
- unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
- if (block_rsv != global_rsv &&
- !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
- ret = 0;
- }
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index cb5056472e79..d841fed73492 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -123,7 +123,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
-int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index cb10e56ee31e..ef7ae20d2b77 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -63,11 +63,41 @@
* This means a slightly higher tree locking latency.
*/
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
+{
+ unsigned int cur = 0;
+ unsigned int nr_bits;
+
+ ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
+
+ nr_bits = PAGE_SIZE / sectorsize;
+ subpage_info->bitmap_nr_bits = nr_bits;
+
+ subpage_info->uptodate_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->error_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->dirty_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->writeback_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->ordered_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->checked_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->total_nr_bits = cur;
+}
+
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type)
{
- struct btrfs_subpage *subpage = NULL;
- int ret;
+ struct btrfs_subpage *subpage;
/*
* We have cases like a dummy extent buffer page, which is not mappped
@@ -75,13 +105,15 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
*/
if (page->mapping)
ASSERT(PageLocked(page));
+
/* Either not subpage, or the page already has private attached */
if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
return 0;
- ret = btrfs_alloc_subpage(fs_info, &subpage, type);
- if (ret < 0)
- return ret;
+ subpage = btrfs_alloc_subpage(fs_info, type);
+ if (IS_ERR(subpage))
+ return PTR_ERR(subpage);
+
attach_page_private(page, subpage);
return 0;
}
@@ -100,24 +132,28 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
btrfs_free_subpage(subpage);
}
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type)
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type)
{
- if (fs_info->sectorsize == PAGE_SIZE)
- return 0;
+ struct btrfs_subpage *ret;
+ unsigned int real_size;
+
+ ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
+ real_size = struct_size(ret, bitmaps,
+ BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+ ret = kzalloc(real_size, GFP_NOFS);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
- *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
- if (!*ret)
- return -ENOMEM;
- spin_lock_init(&(*ret)->lock);
+ spin_lock_init(&ret->lock);
if (type == BTRFS_SUBPAGE_METADATA) {
- atomic_set(&(*ret)->eb_refs, 0);
+ atomic_set(&ret->eb_refs, 0);
} else {
- atomic_set(&(*ret)->readers, 0);
- atomic_set(&(*ret)->writers, 0);
+ atomic_set(&ret->readers, 0);
+ atomic_set(&ret->writers, 0);
}
- return 0;
+ return ret;
}
void btrfs_free_subpage(struct btrfs_subpage *subpage)
@@ -222,8 +258,16 @@ static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
u32 orig_len = *len;
*start = max_t(u64, page_offset(page), orig_start);
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
- orig_start + orig_len) - *start;
+ /*
+ * For certain call sites like btrfs_drop_pages(), we may have pages
+ * beyond the target range. In that case, just set @len to 0, subpage
+ * helpers can handle @len == 0 without any problem.
+ */
+ if (page_offset(page) >= orig_start + orig_len)
+ *len = 0;
+ else
+ *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ orig_start + orig_len) - *start;
}
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
@@ -248,6 +292,16 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
btrfs_subpage_assert(fs_info, page, start, len);
+ /*
+ * We have call sites passing @locked_page into
+ * extent_clear_unlock_delalloc() for compression path.
+ *
+ * This @locked_page is locked by plain lock_page(), thus its
+ * subpage::writers is 0. Handle them in a special way.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ return true;
+
ASSERT(atomic_read(&subpage->writers) >= nbits);
return atomic_sub_and_test(nbits, &subpage->writers);
}
@@ -289,37 +343,59 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
unlock_page(page);
}
-/*
- * Convert the [start, start + len) range into a u16 bitmap
- *
- * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
- */
-static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
{
- const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
- const int nbits = len >> fs_info->sectorsize_bits;
+ unsigned int found_zero;
- btrfs_subpage_assert(fs_info, page, start, len);
+ found_zero = find_next_zero_bit(addr, start + nbits, start);
+ if (found_zero == start + nbits)
+ return true;
+ return false;
+}
- /*
- * Here nbits can be 16, thus can go beyond u16 range. We make the
- * first left shift to be calculate in unsigned long (at least u32),
- * then truncate the result to u16.
- */
- return (u16)(((1UL << nbits) - 1) << bit_start);
+static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
+{
+ unsigned int found_set;
+
+ found_set = find_next_bit(addr, start + nbits, start);
+ if (found_set == start + nbits)
+ return true;
+ return false;
}
+#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+({ \
+ unsigned int start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, page, start, len); \
+ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ start_bit += fs_info->subpage_info->name##_offset; \
+ start_bit; \
+})
+
+#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
+ bitmap_test_range_all_set(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
+#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
+ bitmap_test_range_all_zero(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap |= tmp;
- if (subpage->uptodate_bitmap == U16_MAX)
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
SetPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -328,11 +404,12 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap &= ~tmp;
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
ClearPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -341,11 +418,12 @@ void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -354,12 +432,13 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap &= ~tmp;
- if (subpage->error_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, error))
ClearPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -368,11 +447,12 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
set_page_dirty(page);
}
@@ -391,13 +471,14 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
bool last = false;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap &= ~tmp;
- if (subpage->dirty_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
last = true;
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
@@ -417,11 +498,12 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
set_page_writeback(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -430,12 +512,13 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap &= ~tmp;
- if (subpage->writeback_bitmap == 0) {
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
ASSERT(PageWriteback(page));
end_page_writeback(page);
}
@@ -446,11 +529,12 @@ void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -459,15 +543,46 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap &= ~tmp;
- if (subpage->ordered_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
ClearPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
+
+void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ SetPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ ClearPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@@ -477,12 +592,14 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ name, start, len); \
unsigned long flags; \
bool ret; \
\
spin_lock_irqsave(&subpage->lock, flags); \
- ret = ((subpage->name##_bitmap & tmp) == tmp); \
+ ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ len >> fs_info->sectorsize_bits); \
spin_unlock_irqrestore(&subpage->lock, flags); \
return ret; \
}
@@ -491,6 +608,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -561,6 +679,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
PageOrdered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
@@ -579,5 +698,48 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
return;
ASSERT(PagePrivate(page) && page->private);
- ASSERT(subpage->dirty_bitmap == 0);
+ ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+}
+
+/*
+ * Unlock a page that may have been locked in one of two ways, for both regular
+ * and subpage sector sizes:
+ *
+ * - Page locked by plain lock_page()
+ *   It should not have any subpage::writers count.
+ *   Can be unlocked by unlock_page().
+ *   This is the most common locked page for __extent_writepage() called
+ *   inside extent_write_cache_pages() or extent_write_full_page().
+ *   Rarer cases include the @locked_page from extent_write_locked_range().
+ *
+ * - Page locked by lock_delalloc_pages()
+ *   This only happens in extent_write_locked_range(), for all pages except
+ *   @locked_page.
+ *   In this case, we have to call subpage helper to handle the case.
+ *
+ * @start and @len are only used when the page has writers, where they are
+ * passed on to btrfs_page_end_writer_lock().
+ */
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+
+ ASSERT(PageLocked(page));
+ /* For regular page size case, we just unlock the page */
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return unlock_page(page);
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * For subpage case, there are two types of locked page. With or
+ * without writers number.
+ *
+ * Since we own the page lock, no one else could touch subpage::writers
+ * and we are safe to do several atomic operations without spinlock.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ /* No writers, locked by plain lock_page() */
+ return unlock_page(page);
+
+ /* Have writers, use proper subpage helper to end it */
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 0120948f37a1..7accb5c40d33 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -6,10 +6,38 @@
#include <linux/spinlock.h>
/*
- * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap
- * is sufficient. Regular bitmap_* is not used due to size reasons.
+ * Extra info for subpage bitmap.
+ *
+ * For subpage we pack all uptodate/error/dirty/writeback/ordered/checked
+ * bitmaps into one larger bitmap.
+ *
+ * This structure records how they are organized in the bitmap:
+ *
+ * /- uptodate_offset /- error_offset /- dirty_offset
+ * | | |
+ * v v v
+ * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o|
+ * |<- bitmap_nr_bits ->|
+ * |<--------------- total_nr_bits ---------------->|
*/
-#define BTRFS_SUBPAGE_BITMAP_SIZE 16
+struct btrfs_subpage_info {
+ /* Number of bits for each bitmap */
+ unsigned int bitmap_nr_bits;
+
+ /* Total number of bits for the whole bitmap */
+ unsigned int total_nr_bits;
+
+ /*
+ * *_offset indicates where each bitmap starts, the length is always
+ * @bitmap_nr_bits, which is calculated from PAGE_SIZE / sectorsize.
+ */
+ unsigned int uptodate_offset;
+ unsigned int error_offset;
+ unsigned int dirty_offset;
+ unsigned int writeback_offset;
+ unsigned int ordered_offset;
+ unsigned int checked_offset;
+};
/*
* Structure to trace status of each sector inside a page, attached to
@@ -18,10 +46,6 @@
struct btrfs_subpage {
/* Common members for both data and metadata pages */
spinlock_t lock;
- u16 uptodate_bitmap;
- u16 error_bitmap;
- u16 dirty_bitmap;
- u16 writeback_bitmap;
/*
* Both data and metadata needs to track how many readers are for the
* page.
@@ -38,14 +62,11 @@ struct btrfs_subpage {
* manages whether the subpage can be detached.
*/
atomic_t eb_refs;
- /* Structures only used by data */
- struct {
- atomic_t writers;
- /* Tracke pending ordered extent in this sector */
- u16 ordered_bitmap;
- };
+ /* Structures only used by data */
+ atomic_t writers;
};
+ unsigned long bitmaps[];
};
enum btrfs_subpage_type {
@@ -53,15 +74,15 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type);
void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page);
/* Allocate additional data where page represents more than one sector */
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type);
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
@@ -122,11 +143,14 @@ DECLARE_BTRFS_SUBPAGE_OPS(error);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
DECLARE_BTRFS_SUBPAGE_OPS(ordered);
+DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct page *page);
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 537d90bf5d84..b228efe8ab6e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -23,7 +23,6 @@
#include <linux/miscdevice.h>
#include <linux/magic.h>
#include <linux/slab.h>
-#include <linux/cleancache.h>
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
@@ -67,6 +66,52 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+#ifdef CONFIG_PRINTK
+
+#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
+
+/*
+ * Characters to print to indicate error conditions or uncommon filesystem
+ * state; a state mapped to 0 is never printed. RO is not an error.
+ */
+static const char fs_state_chars[] = {
+ [BTRFS_FS_STATE_ERROR] = 'E',
+ [BTRFS_FS_STATE_REMOUNTING] = 'M',
+ [BTRFS_FS_STATE_RO] = 0,
+ [BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_DEV_REPLACING] = 'R',
+ [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
+ [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
+};
+
+/*
+ * Format the filesystem state as a suffix for log messages.
+ *
+ * Writes STATE_STRING_PREFACE followed by one character from fs_state_chars[]
+ * for every set bit of info->fs_state that has a non-zero character. If no
+ * such bit is set, @buf is set to the empty string. The result is always NUL
+ * terminated.
+ *
+ * @buf must hold at least STATE_STRING_BUF_LEN bytes.
+ */
+static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
+{
+	unsigned int bit;
+	bool states_printed = false;
+	unsigned long fs_state = READ_ONCE(info->fs_state);
+	char *curr = buf;
+
+	/* Copy the preface including its NUL, then continue right after the text */
+	memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
+	curr += sizeof(STATE_STRING_PREFACE) - 1;
+
+	/*
+	 * The size argument of for_each_set_bit() is a number of bits, not
+	 * bytes, so scan the whole word. Bits at or past BTRFS_FS_STATE_COUNT
+	 * are unexpected: they trigger the warning and are never printed, which
+	 * keeps the output within STATE_STRING_BUF_LEN.
+	 */
+	for_each_set_bit(bit, &fs_state, BITS_PER_LONG) {
+		WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
+		if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
+			*curr++ = fs_state_chars[bit];
+			states_printed = true;
+		}
+	}
+
+	/* If no states were printed, reset the buffer */
+	if (!states_printed)
+		curr = buf;
+
+	*curr = 0;
+}
+#endif
+
/*
* Generally the error codes correspond to their respective errors, but there
* are a few special cases.
@@ -129,6 +174,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
+ char statestr[STATE_STRING_BUF_LEN];
const char *errstr;
#endif
@@ -141,6 +187,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
+ btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
va_list args;
@@ -149,12 +196,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
vaf.fmt = fmt;
vaf.va = &args;
- pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, function, line, errno, errstr, &vaf);
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, statestr, function, line, errno, errstr, &vaf);
va_end(args);
} else {
- pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
- sb->s_id, function, line, errno, errstr);
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
+ sb->s_id, statestr, function, line, errno, errstr);
}
#endif
@@ -241,11 +288,15 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
vaf.va = &args;
if (__ratelimit(ratelimit)) {
- if (fs_info)
- printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
- fs_info->sb->s_id, &vaf);
- else
+ if (fs_info) {
+ char statestr[STATE_STRING_BUF_LEN];
+
+ btrfs_state_to_string(fs_info, statestr);
+ printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ fs_info->sb->s_id, statestr, &vaf);
+ } else {
printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ }
}
va_end(args);
@@ -862,6 +913,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_space_cache:
case Opt_space_cache_version:
+ /*
+ * We already set FREE_SPACE_TREE above because we have
+ * compat_ro(FREE_SPACE_TREE) set, and we aren't going
+ * to allow v1 to be set for extent tree v2, simply
+ * ignore this setting if we're extent tree v2.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
if (token == Opt_space_cache ||
strcmp(args[0].from, "v1") == 0) {
btrfs_clear_opt(info->mount_opt,
@@ -882,6 +941,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
+ /*
+ * We cannot operate without the free space tree with
+ * extent tree v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
if (btrfs_test_opt(info, SPACE_CACHE)) {
btrfs_clear_and_info(info, SPACE_CACHE,
"disabling disk space caching");
@@ -897,6 +962,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
"the 'inode_cache' option is deprecated and has no effect since 5.11");
break;
case Opt_clear_cache:
+ /*
+ * We cannot clear the free space tree with extent tree
+ * v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
btrfs_set_and_info(info, CLEAR_CACHE,
"force clearing of disk cache");
break;
@@ -1374,7 +1445,6 @@ static int btrfs_fill_super(struct super_block *sb,
goto fail_close;
}
- cleancache_init_fs(sb);
sb->s_flags |= SB_ACTIVE;
return 0;
@@ -1705,7 +1775,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
goto error_close_devices;
}
- bdev = fs_devices->latest_bdev;
+ bdev = fs_devices->latest_dev->bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
@@ -1842,7 +1912,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
new_pool_size);
}
@@ -2006,7 +2075,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
} else {
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
btrfs_err(fs_info,
"Remounting read-write after error is not allowed");
ret = -EINVAL;
@@ -2386,6 +2455,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
{
struct btrfs_ioctl_vol_args *vol;
struct btrfs_device *device = NULL;
+ dev_t devt = 0;
int ret = -ENOTTY;
if (!capable(CAP_SYS_ADMIN))
@@ -2405,7 +2475,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
mutex_unlock(&uuid_mutex);
break;
case BTRFS_IOC_FORGET_DEV:
- ret = btrfs_forget_devices(vol->name);
+ if (vol->name[0] != 0) {
+ ret = lookup_bdev(vol->name, &devt);
+ if (ret)
+ break;
+ }
+ ret = btrfs_forget_devices(devt);
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
@@ -2463,30 +2538,16 @@ static int btrfs_unfreeze(struct super_block *sb)
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_device *dev, *first_dev = NULL;
/*
- * Lightweight locking of the devices. We should not need
- * device_list_mutex here as we only read the device data and the list
- * is protected by RCU. Even if a device is deleted during the list
- * traversals, we'll get valid data, the freeing callback will wait at
- * least until the rcu_read_unlock.
+ * There should be always a valid pointer in latest_dev, it may be stale
+ * for a short moment in case it's being deleted but still valid until
+ * the end of RCU grace period.
*/
rcu_read_lock();
- list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
-
- if (first_dev)
- seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\");
- else
- WARN_ON(1);
+ seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
rcu_read_unlock();
+
return 0;
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 25a6f587852b..ba78ca5aabbb 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -177,7 +177,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
} else
val = can_modify_feature(fa);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
@@ -283,9 +283,11 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
-/* Remove once support for zoned allocation is feature complete */
#ifdef CONFIG_BTRFS_DEBUG
+/* Remove once support for zoned allocation is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+/* Remove once support for extent tree v2 is feature complete */
+BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -314,6 +316,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(raid1c34),
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(zoned),
+ BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_PTR(verity),
@@ -330,7 +333,7 @@ static const struct attribute_group btrfs_feature_attr_group = {
static ssize_t rmdir_subvol_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "0\n");
+ return sysfs_emit(buf, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
@@ -345,12 +348,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
* This "trick" only works as long as 'enum btrfs_csum_type' has
* no holes in it
*/
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "),
+ btrfs_super_csum_name(i));
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
@@ -358,7 +361,7 @@ BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
static ssize_t send_stream_version_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+ return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION);
}
BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
@@ -378,9 +381,8 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj,
int i;
for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i ? " " : ""), rescue_opts[i]);
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]);
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_rescue_options,
@@ -394,10 +396,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
/* 4K sector size is also supported with 64K page size */
if (PAGE_SIZE == SZ_64K)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+ ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
/* Only sectorsize == PAGE_SIZE is now supported */
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
+ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
return ret;
}
@@ -437,7 +439,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discardable_bytes));
}
BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
@@ -448,7 +450,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "%d\n",
atomic_read(&fs_info->discard_ctl.discardable_extents));
}
BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
@@ -459,8 +461,8 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_bitmap_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
@@ -470,7 +472,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
}
BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
@@ -481,8 +483,8 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_extent_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
@@ -492,8 +494,8 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.iops_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.iops_limit));
}
static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
@@ -523,8 +525,8 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.kbps_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.kbps_limit));
}
static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
@@ -553,8 +555,8 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(fs_info->discard_ctl.max_discard_size));
+ return sysfs_emit(buf, "%llu\n",
+ READ_ONCE(fs_info->discard_ctl.max_discard_size));
}
static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
@@ -627,7 +629,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
val = *value_ptr;
if (lock)
spin_unlock(lock);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
static ssize_t global_rsv_size_show(struct kobject *kobj,
@@ -673,7 +675,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
val += block_group->used;
}
up_read(&sinfo->groups_sem);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
/*
@@ -771,7 +773,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
ssize_t ret;
spin_lock(&fs_info->super_lock);
- ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label);
spin_unlock(&fs_info->super_lock);
return ret;
@@ -819,7 +821,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -829,8 +831,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -840,7 +841,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -852,7 +853,7 @@ static ssize_t quota_override_show(struct kobject *kobj,
int quota_override;
quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
- return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+ return sysfs_emit(buf, "%d\n", quota_override);
}
static ssize_t quota_override_store(struct kobject *kobj,
@@ -890,8 +891,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%pU\n",
- fs_info->fs_devices->metadata_uuid);
+ return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid);
}
BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
@@ -902,9 +902,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
- return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
- btrfs_super_csum_name(csum_type),
- crypto_shash_driver_name(fs_info->csum_shash));
+ return sysfs_emit(buf, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
}
BTRFS_ATTR(, checksum, btrfs_checksum_show);
@@ -922,6 +922,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
case BTRFS_EXCLOP_BALANCE:
str = "balance\n";
break;
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ str = "balance paused\n";
+ break;
case BTRFS_EXCLOP_DEV_ADD:
str = "device add\n";
break;
@@ -941,7 +944,7 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
str = "UNKNOWN\n";
break;
}
- return scnprintf(buf, PAGE_SIZE, "%s", str);
+ return sysfs_emit(buf, "%s", str);
}
BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
@@ -950,7 +953,7 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
+ return sysfs_emit(buf, "%llu\n", fs_info->generation);
}
BTRFS_ATTR(, generation, btrfs_generation_show);
@@ -1028,8 +1031,7 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
ssize_t ret;
- ret = scnprintf(buf, PAGE_SIZE, "%d\n",
- READ_ONCE(fs_info->bg_reclaim_threshold));
+ ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
return ret;
}
@@ -1108,6 +1110,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS];
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) ==
+ ARRAY_SIZE(btrfs_feature_attrs));
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) ==
+ ARRAY_SIZE(btrfs_feature_attrs[0]));
+
static const u64 supported_feature_masks[FEAT_MAX] = {
[FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
@@ -1276,11 +1283,6 @@ static void init_feature_attrs(void)
struct btrfs_feature_attr *fa;
int set, i;
- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
- ARRAY_SIZE(btrfs_feature_attrs));
- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
- ARRAY_SIZE(btrfs_feature_attrs[0]));
-
memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
memset(btrfs_unknown_feature_names, 0,
sizeof(btrfs_unknown_feature_names));
@@ -1471,7 +1473,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
@@ -1484,7 +1486,7 @@ static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
@@ -1498,7 +1500,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
@@ -1509,8 +1511,7 @@ static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
struct btrfs_device *device = container_of(kobj, struct btrfs_device,
devid_kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(device->scrub_speed_max));
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max));
}
static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
@@ -1538,10 +1539,20 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
+/* sysfs show callback: emit device->fs_devices->fsid formatted with %pU */
+static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj,
+				       struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_device *dev;
+
+	dev = container_of(kobj, struct btrfs_device, devid_kobj);
+	return sysfs_emit(buf, "%pU\n", dev->fs_devices->fsid);
+}
+BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show);
+
static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1549,14 +1560,14 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
devid_kobj);
if (!device->dev_stats_valid)
- return scnprintf(buf, PAGE_SIZE, "invalid\n");
+ return sysfs_emit(buf, "invalid\n");
/*
* Print all at once so we get a snapshot of all values from the same
* time. Keep them in sync and in order of definition of
* btrfs_dev_stat_values.
*/
- return scnprintf(buf, PAGE_SIZE,
+ return sysfs_emit(buf,
"write_errs %d\n"
"read_errs %d\n"
"flush_errs %d\n"
@@ -1577,6 +1588,7 @@ BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
*/
static struct attribute *devid_attrs[] = {
BTRFS_ATTR_PTR(devid, error_stats),
+ BTRFS_ATTR_PTR(devid, fsid),
BTRFS_ATTR_PTR(devid, in_fs_metadata),
BTRFS_ATTR_PTR(devid, missing),
BTRFS_ATTR_PTR(devid, replace_target),
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 3a4099a2bf05..d8e56edd6991 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -204,6 +204,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
+ btrfs_global_root_delete(root);
btrfs_put_root(root);
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index df54cdfdc250..51a8b075c259 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -15,7 +15,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
struct btrfs_path *path = NULL;
struct btrfs_root *root = NULL;
struct extent_buffer *eb;
- struct btrfs_item *item;
char *value = "mary had a little lamb";
char *split1 = "mary had a little";
char *split2 = " lamb";
@@ -60,8 +59,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, 1);
- item = btrfs_item_nr(0);
+ btrfs_setup_item_for_insert(root, path, &key, value_len);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
@@ -90,8 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(0);
- if (btrfs_item_size(eb, item) != strlen(split1)) {
+ if (btrfs_item_size(eb, 0) != strlen(split1)) {
test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
@@ -115,8 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(1);
- if (btrfs_item_size(eb, item) != strlen(split2)) {
+ if (btrfs_item_size(eb, 1) != strlen(split2)) {
test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
@@ -147,8 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(0);
- if (btrfs_item_size(eb, item) != strlen(split3)) {
+ if (btrfs_item_size(eb, 0) != strlen(split3)) {
test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
@@ -171,8 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(1);
- if (btrfs_item_size(eb, item) != strlen(split4)) {
+ if (btrfs_item_size(eb, 1) != strlen(split4)) {
test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
@@ -195,8 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(2);
- if (btrfs_item_size(eb, item) != strlen(split2)) {
+ if (btrfs_item_size(eb, 2) != strlen(split2)) {
test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 73e96d505f4f..a232b15b8021 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -56,6 +56,54 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
return count;
}
+/* Size of the buffer that extent_flag_to_str() formats into */
+#define STATE_FLAG_STR_LEN 256
+
+/*
+ * If EXTENT_<name> is set in @state->state, append <name> to @dest at offset
+ * @cur, preceded by '|' unless it is the first name. @cur is advanced by the
+ * value scnprintf() returns and output is bounded by STATE_FLAG_STR_LEN.
+ */
+#define PRINT_ONE_FLAG(state, dest, cur, name) \
+({ \
+ if (state->state & EXTENT_##name) \
+ cur += scnprintf(dest + cur, STATE_FLAG_STR_LEN - cur, \
+ "%s" #name, cur == 0 ? "" : "|"); \
+})
+
+/*
+ * Build a '|'-separated list of the names of the flags set in @state->state
+ * into @dest, in the order listed below. @dest is formatted as a buffer of
+ * STATE_FLAG_STR_LEN bytes and is left empty if none of the listed flags is
+ * set. Flags that are not listed here are not printed.
+ */
+static void extent_flag_to_str(const struct extent_state *state, char *dest)
+{
+ int cur = 0;
+
+ dest[0] = 0;
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY);
+ PRINT_ONE_FLAG(state, dest, cur, UPTODATE);
+ PRINT_ONE_FLAG(state, dest, cur, LOCKED);
+ PRINT_ONE_FLAG(state, dest, cur, NEW);
+ PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
+ PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
+ PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
+ PRINT_ONE_FLAG(state, dest, cur, NODATASUM);
+ PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV);
+ PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT);
+ PRINT_ONE_FLAG(state, dest, cur, DAMAGED);
+ PRINT_ONE_FLAG(state, dest, cur, NORESERVE);
+ PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED);
+ PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV);
+}
+
+/*
+ * Print every extent state of @tree through test_msg(): start offset, length
+ * and the flag names produced by extent_flag_to_str().
+ */
+static void dump_extent_io_tree(const struct extent_io_tree *tree)
+{
+	char flags_str[STATE_FLAG_STR_LEN];
+	struct rb_node *cur;
+
+	test_msg("io tree content:");
+	for (cur = rb_first(&tree->state); cur; cur = rb_next(cur)) {
+		struct extent_state *es;
+
+		es = rb_entry(cur, struct extent_state, rb_node);
+		extent_flag_to_str(es, flags_str);
+		/* state->end is inclusive, hence the + 1 for the length */
+		test_msg(" start=%llu len=%llu flags=%s", es->start,
+			 es->end + 1 - es->start, flags_str);
+	}
+}
+
static int test_find_delalloc(u32 sectorsize)
{
struct inode *inode;
@@ -112,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -143,7 +191,7 @@ static int test_find_delalloc(u32 sectorsize)
}
set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -177,14 +225,14 @@ static int test_find_delalloc(u32 sectorsize)
goto out_bits;
}
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
}
- if (end != (u64)-1) {
+ if (end != test_start + PAGE_SIZE - 1) {
test_err("did not return the proper end offset");
goto out_bits;
}
@@ -198,7 +246,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -233,7 +281,7 @@ static int test_find_delalloc(u32 sectorsize)
/* We unlocked it in the previous test */
lock_page(locked_page);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
/*
* Currently if we fail to find dirty pages in the delalloc range we
* will adjust max_bytes down to PAGE_SIZE and then re-search. If
@@ -258,6 +306,8 @@ static int test_find_delalloc(u32 sectorsize)
}
ret = 0;
out_bits:
+ if (ret)
+ dump_extent_io_tree(tmp);
clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
out:
if (locked_page)
@@ -534,6 +584,8 @@ static int test_find_first_clear_extent_bit(void)
ret = 0;
out:
+ if (ret)
+ dump_extent_io_tree(&tree);
clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
return ret;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 319fed82d741..c5b3a631bf4f 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -15,6 +15,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
struct extent_map *em;
struct rb_node *node;
+ write_lock(&em_tree->lock);
while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
@@ -32,6 +33,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
#endif
free_extent_map(em);
}
+ write_unlock(&em_tree->lock);
}
/*
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 8f05c1eb833f..5930cdcae5cb 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -824,6 +824,184 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
return 0;
}
+static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ return true;
+}
+
+static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
+{
+ const struct btrfs_free_space_op test_free_space_ops = {
+ .use_bitmap = bytes_index_use_bitmap,
+ };
+ const struct btrfs_free_space_op *orig_free_space_ops;
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ u64 offset, max_extent_size, bytes;
+ int ret, i;
+
+ test_msg("running bytes index tests");
+
+ /* First just validate that it does everything in order. */
+ offset = 0;
+ for (i = 0; i < 10; i++) {
+ bytes = (i + 1) * SZ_1M;
+ ret = test_add_free_space_entry(cache, offset, bytes, 0);
+ if (ret) {
+ test_err("couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+ offset += bytes + sectorsize;
+ }
+
+ for (node = rb_first_cached(&ctl->free_space_bytes), i = 9; node;
+ node = rb_next(node), i--) {
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ bytes = (i + 1) * SZ_1M;
+ if (entry->bytes != bytes) {
+ test_err("invalid bytes index order, found %llu expected %llu",
+ entry->bytes, bytes);
+ return -EINVAL;
+ }
+ }
+
+ /* Now validate bitmaps do the correct thing. */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ for (i = 0; i < 2; i++) {
+ offset = i * BITS_PER_BITMAP * sectorsize;
+ bytes = (i + 1) * SZ_1M;
+ ret = test_add_free_space_entry(cache, offset, bytes, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry");
+ return ret;
+ }
+ }
+
+ for (node = rb_first_cached(&ctl->free_space_bytes), i = 1; node;
+ node = rb_next(node), i--) {
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ bytes = (i + 1) * SZ_1M;
+ if (entry->bytes != bytes) {
+ test_err("invalid bytes index order, found %llu expected %llu",
+ entry->bytes, bytes);
+ return -EINVAL;
+ }
+ }
+
+ /* Now validate bitmaps with different ->max_extent_size. */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ orig_free_space_ops = cache->free_space_ctl->op;
+ cache->free_space_ctl->op = &test_free_space_ops;
+
+ ret = test_add_free_space_entry(cache, 0, sectorsize, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry");
+ return ret;
+ }
+
+ offset = BITS_PER_BITMAP * sectorsize;
+ ret = test_add_free_space_entry(cache, offset, sectorsize, 1);
+ if (ret) {
+ test_err("couldn't add bitmap_entry");
+ return ret;
+ }
+
+ /*
+ * Now set a bunch of sectorsize extents in the first entry so its
+ * ->bytes is large.
+ */
+ for (i = 2; i < 20; i += 2) {
+ offset = sectorsize * i;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error populating sparse bitmap %d", ret);
+ return ret;
+ }
+ }
+
+ /*
+ * Now set a contiguous extent in the second bitmap so its
+ * ->max_extent_size is larger than the first bitmap's.
+ */
+ offset = (BITS_PER_BITMAP * sectorsize) + sectorsize;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error adding contiguous extent %d", ret);
+ return ret;
+ }
+
+ /*
+ * Since we don't set ->max_extent_size unless we search, everything
+ * should be indexed on bytes.
+ */
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (10 * sectorsize)) {
+ test_err("error, wrong entry in the first slot in bytes_index");
+ return -EINVAL;
+ }
+
+ max_extent_size = 0;
+ offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3,
+ 0, &max_extent_size);
+ if (offset != 0) {
+ test_err("found space to alloc even though we don't have enough space");
+ return -EINVAL;
+ }
+
+ if (max_extent_size != (2 * sectorsize)) {
+ test_err("got the wrong max_extent size %llu expected %llu",
+ max_extent_size, (unsigned long long)(2 * sectorsize));
+ return -EINVAL;
+ }
+
+ /*
+ * The search should have re-arranged the bytes index to use the
+ * ->max_extent_size, validate it's now what we expect it to be.
+ */
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (2 * sectorsize)) {
+ test_err("error, the bytes index wasn't recalculated properly");
+ return -EINVAL;
+ }
+
+ /* Add another sectorsize to re-arrange the tree back to ->bytes. */
+ offset = (BITS_PER_BITMAP * sectorsize) - sectorsize;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error adding extent to the sparse entry %d", ret);
+ return ret;
+ }
+
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (11 * sectorsize)) {
+ test_err("error, wrong entry in the first slot in bytes_index");
+ return -EINVAL;
+ }
+
+ /*
+ * Now make sure we find our correct entry after searching that will
+ * result in a re-arranging of the tree.
+ */
+ max_extent_size = 0;
+ offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2,
+ 0, &max_extent_size);
+ if (offset != (BITS_PER_BITMAP * sectorsize)) {
+ test_err("error, found %llu instead of %llu for our alloc",
+ offset,
+ (unsigned long long)(BITS_PER_BITMAP * sectorsize));
+ return -EINVAL;
+ }
+
+ cache->free_space_ctl->op = orig_free_space_ops;
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ return 0;
+}
+
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
@@ -858,7 +1036,10 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
goto out;
}
- root->fs_info->extent_root = root;
+ root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
ret = test_extents(cache);
if (ret)
@@ -871,6 +1052,9 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
goto out;
ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize);
+ if (ret)
+ goto out;
+ ret = test_bytes_index(cache, sectorsize);
out:
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 2c783d2f5228..13734ed43bfc 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -446,7 +446,10 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
- root->fs_info->free_space_root = root;
+ root->root_key.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
root->fs_info->tree_root = root;
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index c9874b12d337..cac89c388131 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
}
/*
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 19ba7d5b7d8f..eee1e4459541 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -455,7 +455,10 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
}
/* We are using this root as our extent root */
- root->fs_info->extent_root = root;
+ root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
/*
* Some of the paths we test assume we have a filled out fs_info, so we
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 14b9fdc8aaa9..b008c5110958 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -162,7 +162,17 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
struct btrfs_root *root, *tmp;
struct btrfs_caching_control *caching_ctl, *next;
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
+
down_write(&fs_info->commit_root_sem);
+
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ fs_info->last_reloc_trans = trans->transid;
+
list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
dirty_list) {
list_del_init(&root->dirty_list);
@@ -283,7 +293,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
@@ -331,7 +341,7 @@ loop:
*/
kfree(cur_trans);
goto loop;
- } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ } else if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
kfree(cur_trans);
return -EROFS;
@@ -413,7 +423,6 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
root->last_trans < trans->transid) || force) {
- WARN_ON(root == fs_info->extent_root);
WARN_ON(!force && root->commit_root != root->node);
/*
@@ -579,7 +588,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
bool do_chunk_alloc = false;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return ERR_PTR(-EROFS);
if (current->journal_info) {
@@ -628,7 +637,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
+ ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
if (ret)
goto reserve_fail;
if (delayed_refs_bytes) {
@@ -692,7 +701,6 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
- h->root = root;
refcount_set(&h->use_count, 1);
h->fs_info = root->fs_info;
@@ -846,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
static noinline void wait_for_commit(struct btrfs_transaction *commit,
const enum btrfs_trans_state min_state)
{
- wait_event(commit->commit_wait, commit->state >= min_state);
+ struct btrfs_fs_info *fs_info = commit->fs_info;
+ u64 transid = commit->transid;
+ bool put = false;
+
+ while (1) {
+ wait_event(commit->commit_wait, commit->state >= min_state);
+ if (put)
+ btrfs_put_transaction(commit);
+
+ if (min_state < TRANS_STATE_COMPLETED)
+ break;
+
+ /*
+ * A transaction isn't really completed until all of the
+ * previous transactions are completed, but with fsync we can
+ * end up with SUPER_COMMITTED transactions before a COMPLETED
+ * transaction. Wait for those.
+ */
+
+ spin_lock(&fs_info->trans_lock);
+ commit = list_first_entry_or_null(&fs_info->trans_list,
+ struct btrfs_transaction,
+ list);
+ if (!commit || commit->transid > transid) {
+ spin_unlock(&fs_info->trans_lock);
+ break;
+ }
+ refcount_inc(&commit->use_count);
+ put = true;
+ spin_unlock(&fs_info->trans_lock);
+ }
}
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
@@ -991,8 +1029,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(info);
- if (TRANS_ABORTED(trans) ||
- test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
+ if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
wake_up_process(info->transaction_kthread);
if (TRANS_ABORTED(trans))
err = trans->aborted;
@@ -1237,6 +1274,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
struct extent_buffer *eb;
int ret;
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
0, &eb, BTRFS_NESTING_COW);
@@ -1268,9 +1311,8 @@ again:
root = list_entry(next, struct btrfs_root, dirty_list);
clear_bit(BTRFS_ROOT_DIRTY, &root->state);
- if (root != fs_info->extent_root)
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
@@ -1300,9 +1342,6 @@ again:
if (!list_empty(&fs_info->dirty_cowonly_roots))
goto again;
- list_add_tail(&fs_info->extent_root->dirty_list,
- &trans->transaction->switch_commits);
-
/* Update dev-replace pointer once everything is committed */
fs_info->dev_replace.committed_cursor_left =
fs_info->dev_replace.cursor_left_last_write_of_item;
@@ -1311,6 +1350,32 @@ again:
}
/*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * We put the drop in progress roots at the front of the list, so if the
+ * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+ * up.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (!list_empty(&fs_info->dead_roots)) {
+ struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root,
+ root_list);
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+ spin_unlock(&fs_info->trans_lock);
+ return;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_wake_unfinished_drop(fs_info);
+}
+
+/*
* dead roots are old snapshots that need to be deleted. This allocates
* a dirty root struct and adds it into the list of dead roots that need to
* be deleted
@@ -1322,13 +1387,19 @@ void btrfs_add_dead_root(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (list_empty(&root->root_list)) {
btrfs_grab_root(root);
- list_add_tail(&root->root_list, &fs_info->dead_roots);
+
+ /* We want to process the partially complete drops first. */
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+ list_add(&root->root_list, &fs_info->dead_roots);
+ else
+ list_add_tail(&root->root_list, &fs_info->dead_roots);
}
spin_unlock(&fs_info->trans_lock);
}
/*
- * update all the cowonly tree roots on disk
+ * Update each subvolume root and its relocation root, if it exists, in the tree
+ * of tree roots. Also free log roots if they exist.
*/
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
@@ -1337,6 +1408,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
int i;
int ret;
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
@@ -1349,6 +1426,14 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
struct btrfs_root *root = gang[i];
int ret2;
+ /*
+ * At this point we can neither have tasks logging inodes
+ * from a root nor trying to commit a log tree.
+ */
+ ASSERT(atomic_read(&root->log_writers) == 0);
+ ASSERT(atomic_read(&root->log_commit[0]) == 0);
+ ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
@@ -1473,12 +1558,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
return ret;
}
- /*
- * We are going to commit transaction, see btrfs_commit_transaction()
- * comment for reason locking tree_log_mutex
- */
- mutex_lock(&fs_info->tree_log_mutex);
-
ret = commit_fs_roots(trans);
if (ret)
goto out;
@@ -1514,8 +1593,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
"Error while writing out transaction for qgroup");
out:
- mutex_unlock(&fs_info->tree_log_mutex);
-
/*
* Force parent root to be updated, as we recorded it before so its
* last_trans == cur_transid.
@@ -1579,7 +1656,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_reloc_pre_snapshot(pending, &to_reserve);
if (to_reserve > 0) {
- pending->error = btrfs_block_rsv_add(root,
+ pending->error = btrfs_block_rsv_add(fs_info,
&pending->block_rsv,
to_reserve,
BTRFS_RESERVE_NO_FLUSH);
@@ -1834,6 +1911,14 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ root_item = &fs_info->block_group_root->root_item;
+
+ super->block_group_root = root_item->bytenr;
+ super->block_group_root_generation = root_item->generation;
+ super->block_group_root_level = root_item->level;
+ }
}
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -1862,50 +1947,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
return ret;
}
-/*
- * commit transactions asynchronously. once btrfs_commit_transaction_async
- * returns, any subsequent transaction will not be allowed to join.
- */
-struct btrfs_async_commit {
- struct btrfs_trans_handle *newtrans;
- struct work_struct work;
-};
-
-static void do_async_commit(struct work_struct *work)
-{
- struct btrfs_async_commit *ac =
- container_of(work, struct btrfs_async_commit, work);
-
- /*
- * We've got freeze protection passed with the transaction.
- * Tell lockdep about it.
- */
- if (ac->newtrans->type & __TRANS_FREEZABLE)
- __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
-
- current->journal_info = ac->newtrans;
-
- btrfs_commit_transaction(ac->newtrans);
- kfree(ac);
-}
-
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_async_commit *ac;
struct btrfs_transaction *cur_trans;
- ac = kmalloc(sizeof(*ac), GFP_NOFS);
- if (!ac)
- return -ENOMEM;
-
- INIT_WORK(&ac->work, do_async_commit);
- ac->newtrans = btrfs_join_transaction(trans->root);
- if (IS_ERR(ac->newtrans)) {
- int err = PTR_ERR(ac->newtrans);
- kfree(ac);
- return err;
- }
+ /* Kick the transaction kthread. */
+ set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+ wake_up_process(fs_info->transaction_kthread);
/* take transaction reference */
cur_trans = trans->transaction;
@@ -1914,28 +1963,15 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
btrfs_end_transaction(trans);
/*
- * Tell lockdep we've released the freeze rwsem, since the
- * async commit thread will be the one to unlock it.
- */
- if (ac->newtrans->type & __TRANS_FREEZABLE)
- __sb_writers_release(fs_info->sb, SB_FREEZE_FS);
-
- schedule_work(&ac->work);
- /*
* Wait for the current transaction commit to start and block
* subsequent transaction joins
*/
wait_event(fs_info->transaction_blocked_wait,
cur_trans->state >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(cur_trans));
- if (current->journal_info == trans)
- current->journal_info = NULL;
-
btrfs_put_transaction(cur_trans);
- return 0;
}
-
static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1987,7 +2023,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
- trace_btrfs_transaction_commit(trans->root);
+ trace_btrfs_transaction_commit(fs_info);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -2014,16 +2050,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
/*
- * We use writeback_inodes_sb here because if we used
+ * We use try_to_writeback_inodes_sb() here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
* Currently are holding the fs freeze lock, if we do an async flush
* we'll do btrfs_join_transaction() and deadlock because we need to
* wait for the fs freeze lock. Using the direct flushing we benefit
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
+ *
+ * Note that try_to_writeback_inodes_sb() will only trigger writeback
+ * if it can read lock sb->s_umount. It will always be able to lock it,
+ * except when the filesystem is being unmounted or being frozen, but in
+ * those cases sync_filesystem() is called, which results in calling
+ * writeback_inodes_sb() while holding a write lock on sb->s_umount.
+ * Note that we don't call writeback_inodes_sb() directly, because it
+ * will emit a warning if sb->s_umount is not locked.
*/
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
- writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+ try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
return 0;
}
@@ -2033,6 +2077,27 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
+/*
+ * Add a pending snapshot associated with the given transaction handle to the
+ * respective transaction. This must be called after the transaction commit started
+ * and while holding fs_info->trans_lock.
+ * This serves to guarantee a caller of btrfs_commit_transaction() that it can
+ * safely free the pending snapshot pointer in case btrfs_commit_transaction()
+ * returns an error.
+ */
+static void add_pending_snapshot(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ if (!trans->pending_snapshot)
+ return;
+
+ lockdep_assert_held(&trans->fs_info->trans_lock);
+ ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
+
+ list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2106,6 +2171,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+ add_pending_snapshot(trans);
+
spin_unlock(&fs_info->trans_lock);
refcount_inc(&cur_trans->use_count);
@@ -2155,7 +2222,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* abort to prevent writing a new superblock that reflects a
* corrupt state (pointing to trees with unwritten nodes/leafs).
*/
- if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EROFS;
goto cleanup_transaction;
}
@@ -2196,11 +2263,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* COMMIT_DOING so make sure to wait for num_writers to == 1 again.
*/
spin_lock(&fs_info->trans_lock);
+ add_pending_snapshot(trans);
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
+ /*
+ * We've started the commit, clear the flag in case we were triggered to
+ * do an async commit but somebody else started before the transaction
+ * kthread could do the work.
+ */
+ clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
goto scrub_continue;
@@ -2247,24 +2322,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
WARN_ON(cur_trans != trans->transaction);
- /* btrfs_commit_tree_roots is responsible for getting the
- * various roots consistent with each other. Every pointer
- * in the tree of tree roots has to point to the most up to date
- * root for every subvolume and other tree. So, we have to keep
- * the tree logging code from jumping in and changing any
- * of the trees.
- *
- * At this point in the commit, there can't be any tree-log
- * writers, but a little lower down we drop the trans mutex
- * and let new people in. By holding the tree_log_mutex
- * from now until after the super is written, we avoid races
- * with the tree-log code.
- */
- mutex_lock(&fs_info->tree_log_mutex);
-
ret = commit_fs_roots(trans);
if (ret)
- goto unlock_tree_log;
+ goto unlock_reloc;
/*
* Since the transaction is done, we can apply the pending changes
@@ -2283,11 +2343,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
ret = btrfs_qgroup_account_extents(trans);
if (ret < 0)
- goto unlock_tree_log;
+ goto unlock_reloc;
ret = commit_cowonly_roots(trans);
if (ret)
- goto unlock_tree_log;
+ goto unlock_reloc;
/*
* The tasks which save the space cache and inode cache may also
@@ -2295,7 +2355,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
- goto unlock_tree_log;
+ goto unlock_reloc;
}
cur_trans = fs_info->running_transaction;
@@ -2310,6 +2370,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_set_root_node(&fs_info->block_group_root->root_item,
+ fs_info->block_group_root->node);
+ list_add_tail(&fs_info->block_group_root->dirty_list,
+ &cur_trans->switch_commits);
+ }
+
switch_commit_roots(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
@@ -2328,6 +2395,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_trans_release_chunk_metadata(trans);
+ /*
+ * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
+ * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
+ * make sure that no other task can start a new transaction and
+ * commit a log tree before we commit our superblock. Anyone
+ * trying to commit a log tree locks this mutex before writing
+ * its superblock.
+ */
+ mutex_lock(&fs_info->tree_log_mutex);
+
spin_lock(&fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
fs_info->running_transaction = NULL;
@@ -2340,10 +2417,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction");
- /*
- * reloc_mutex has been unlocked, tree_log_mutex is still held
- * but we can't jump to unlock_tree_log causing double unlock
- */
mutex_unlock(&fs_info->tree_log_mutex);
goto scrub_continue;
}
@@ -2394,7 +2467,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(fs_info->sb);
- trace_btrfs_transaction_commit(trans->root);
+ trace_btrfs_transaction_commit(fs_info);
btrfs_scrub_continue(fs_info);
@@ -2405,8 +2478,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
-unlock_tree_log:
- mutex_unlock(&fs_info->tree_log_mutex);
unlock_reloc:
mutex_unlock(&fs_info->reloc_mutex);
scrub_continue:
@@ -2434,10 +2505,10 @@ cleanup_transaction:
* because btrfs_commit_super will poke cleaner thread and it will process it a
* few seconds later.
*/
-int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *root;
int ret;
- struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
if (list_empty(&fs_info->dead_roots)) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ba45065f9451..970ff316069d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -123,6 +123,8 @@ struct btrfs_trans_handle {
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
+ /* Set by a task that wants to create a snapshot. */
+ struct btrfs_pending_snapshot *pending_snapshot;
refcount_t use_count;
unsigned int type;
/*
@@ -135,7 +137,6 @@ struct btrfs_trans_handle {
bool removing_chunk;
bool reloc_reserved;
bool in_fsync;
- struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
};
@@ -215,9 +216,10 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
-int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7733e8ac0a69..e56c0107eea3 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -202,7 +202,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_file_extent_item *fi;
u32 sectorsize = fs_info->sectorsize;
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
u64 extent_end;
if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
@@ -354,17 +354,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
key->offset, sectorsize);
return -EUCLEAN;
}
- if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) {
+ if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) {
generic_err(leaf, slot,
"unaligned item size for csum item, have %u should be aligned to %u",
- btrfs_item_size_nr(leaf, slot), csumsize);
+ btrfs_item_size(leaf, slot), csumsize);
return -EUCLEAN;
}
if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) {
u64 prev_csum_end;
u32 prev_item_size;
- prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
+ prev_item_size = btrfs_item_size(leaf, slot - 1);
prev_csum_end = (prev_item_size / csumsize) * sectorsize;
prev_csum_end += prev_key->offset;
if (unlikely(prev_csum_end > key->offset)) {
@@ -483,7 +483,7 @@ static int check_dir_item(struct extent_buffer *leaf,
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_dir_item *di;
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
u32 cur = 0;
if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
@@ -639,8 +639,10 @@ static void block_group_err(const struct extent_buffer *eb, int slot,
static int check_block_group_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_block_group_item bgi;
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
+ u64 chunk_objectid;
u64 flags;
u64 type;
@@ -663,8 +665,23 @@ static int check_block_group_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
- if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) !=
- BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
+ chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ /*
+ * We don't init the nr_global_roots until we load the global
+ * roots, so this could be 0 at mount time. If it's 0 we'll
+ * just assume we're fine, and later we'll check against our
+ * actual value.
+ */
+ if (unlikely(fs_info->nr_global_roots &&
+ chunk_objectid >= fs_info->nr_global_roots)) {
+ block_group_err(leaf, slot,
+ "invalid block group global root id, have %llu, needs to be <= %llu",
+ chunk_objectid,
+ fs_info->nr_global_roots);
+ return -EUCLEAN;
+ }
+ } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
block_group_err(leaf, slot,
"invalid block group chunk objectid, have %llu expect %llu",
btrfs_stack_block_group_chunk_objectid(&bgi),
@@ -912,10 +929,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
{
int num_stripes;
- if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) {
+ if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect [%zu, %u)",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
sizeof(struct btrfs_chunk),
BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
return -EUCLEAN;
@@ -927,10 +944,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
goto out;
if (unlikely(btrfs_chunk_item_size(num_stripes) !=
- btrfs_item_size_nr(leaf, slot))) {
+ btrfs_item_size(leaf, slot))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect %lu",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
btrfs_chunk_item_size(num_stripes));
return -EUCLEAN;
}
@@ -965,6 +982,7 @@ static int check_dev_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
struct btrfs_dev_item *ditem;
+ const u32 item_size = btrfs_item_size(leaf, slot);
if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) {
dev_item_err(leaf, slot,
@@ -972,6 +990,13 @@ static int check_dev_item(struct extent_buffer *leaf,
key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
return -EUCLEAN;
}
+
+ if (unlikely(item_size != sizeof(*ditem))) {
+ dev_item_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*ditem));
+ return -EUCLEAN;
+ }
+
ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) {
dev_item_err(leaf, slot,
@@ -1007,6 +1032,7 @@ static int check_inode_item(struct extent_buffer *leaf,
struct btrfs_inode_item *iitem;
u64 super_gen = btrfs_super_generation(fs_info->super_copy);
u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
+ const u32 item_size = btrfs_item_size(leaf, slot);
u32 mode;
int ret;
u32 flags;
@@ -1016,6 +1042,12 @@ static int check_inode_item(struct extent_buffer *leaf,
if (unlikely(ret < 0))
return ret;
+ if (unlikely(item_size != sizeof(*iitem))) {
+ generic_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*iitem));
+ return -EUCLEAN;
+ }
+
iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
/* Here we use super block generation + 1 to handle log tree */
@@ -1095,12 +1127,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
if (unlikely(ret < 0))
return ret;
- if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
- btrfs_item_size_nr(leaf, slot) !=
+ if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size(leaf, slot) !=
btrfs_legacy_root_item_size())) {
generic_err(leaf, slot,
"invalid root item size, have %u expect %zu or %u",
- btrfs_item_size_nr(leaf, slot), sizeof(ri),
+ btrfs_item_size(leaf, slot), sizeof(ri),
btrfs_legacy_root_item_size());
return -EUCLEAN;
}
@@ -1111,7 +1143,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
* And since we allow geneartion_v2 as 0, it will still pass the check.
*/
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
- btrfs_item_size_nr(leaf, slot));
+ btrfs_item_size(leaf, slot));
/* Generation related */
if (unlikely(btrfs_root_generation(&ri) >
@@ -1208,7 +1240,7 @@ static int check_extent_item(struct extent_buffer *leaf,
bool is_tree_block = false;
unsigned long ptr; /* Current pointer inside inline refs */
unsigned long end; /* Extent item end */
- const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 item_size = btrfs_item_size(leaf, slot);
u64 flags;
u64 generation;
u64 total_refs; /* Total refs in btrfs_extent_item */
@@ -1432,10 +1464,10 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
if (key->type == BTRFS_SHARED_DATA_REF_KEY)
expect_item_size = sizeof(struct btrfs_shared_data_ref);
- if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) {
+ if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
generic_err(leaf, slot,
"invalid item size, have %u expect %u for key type %u",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
expect_item_size, key->type);
return -EUCLEAN;
}
@@ -1460,12 +1492,12 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
{
struct btrfs_extent_data_ref *dref;
unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
- const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);
+ const unsigned long end = ptr + btrfs_item_size(leaf, slot);
- if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) {
+ if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) {
generic_err(leaf, slot,
"invalid item size, have %u expect aligned to %zu for key type %u",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
sizeof(*dref), key->type);
return -EUCLEAN;
}
@@ -1507,16 +1539,16 @@ static int check_inode_ref(struct extent_buffer *leaf,
if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
/* namelen can't be 0, so item_size == sizeof() is also invalid */
- if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) {
+ if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) {
inode_ref_err(leaf, slot,
"invalid item size, have %u expect (%zu, %u)",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
return -EUCLEAN;
}
ptr = btrfs_item_ptr_offset(leaf, slot);
- end = ptr + btrfs_item_size_nr(leaf, slot);
+ end = ptr + btrfs_item_size(leaf, slot);
while (ptr < end) {
u16 namelen;
@@ -1633,7 +1665,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
/* These trees must never be empty */
if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID ||
owner == BTRFS_CHUNK_TREE_OBJECTID ||
- owner == BTRFS_EXTENT_TREE_OBJECTID ||
owner == BTRFS_DEV_TREE_OBJECTID ||
owner == BTRFS_FS_TREE_OBJECTID ||
owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
@@ -1642,12 +1673,25 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
owner);
return -EUCLEAN;
}
+
/* Unknown tree */
if (unlikely(owner == 0)) {
generic_err(leaf, 0,
"invalid owner, root 0 is not defined");
return -EUCLEAN;
}
+
+ /* EXTENT_TREE_V2 can have empty extent trees. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) {
+ generic_err(leaf, 0,
+ "invalid root, root %llu must never be empty",
+ owner);
+ return -EUCLEAN;
+ }
+
return 0;
}
@@ -1667,6 +1711,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
*/
for (slot = 0; slot < nritems; slot++) {
u32 item_end_expected;
+ u64 item_data_end;
int ret;
btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -1681,6 +1726,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
return -EUCLEAN;
}
+ item_data_end = (u64)btrfs_item_offset(leaf, slot) +
+ btrfs_item_size(leaf, slot);
/*
* Make sure the offset and ends are right, remember that the
* item data starts at the end of the leaf and grows towards the
@@ -1689,13 +1736,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
if (slot == 0)
item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info);
else
- item_end_expected = btrfs_item_offset_nr(leaf,
+ item_end_expected = btrfs_item_offset(leaf,
slot - 1);
- if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) {
+ if (unlikely(item_data_end != item_end_expected)) {
generic_err(leaf, slot,
- "unexpected item end, have %u expect %u",
- btrfs_item_end_nr(leaf, slot),
- item_end_expected);
+ "unexpected item end, have %llu expect %u",
+ item_data_end, item_end_expected);
return -EUCLEAN;
}
@@ -1704,12 +1750,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
- if (unlikely(btrfs_item_end_nr(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(fs_info))) {
+ if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) {
generic_err(leaf, slot,
- "slot end outside of leaf, have %u expect range [0, %u]",
- btrfs_item_end_nr(leaf, slot),
- BTRFS_LEAF_DATA_SIZE(fs_info));
+ "slot end outside of leaf, have %llu expect range [0, %u]",
+ item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 7c45d960b53c..b6cf39f4e7e4 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -27,14 +27,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int next_key_ret = 0;
u64 last_ret = 0;
- if (root->fs_info->extent_root == root) {
- /*
- * there's recursion here right now in the tree locking,
- * we can't defrag the extent root without deadlock
- */
- goto out;
- }
-
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
goto out;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index b415c5ec03ea..e65633686378 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,6 +20,7 @@
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
+#include "inode-item.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -94,7 +95,7 @@ enum {
};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
@@ -207,7 +208,7 @@ again:
}
atomic_inc(&root->log_writers);
- if (ctx && !ctx->logging_new_name) {
+ if (!ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -269,12 +270,6 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
}
-static int btrfs_write_tree_block(struct extent_buffer *buf)
-{
- return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
- buf->start + buf->len - 1);
-}
-
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
filemap_fdatawait_range(buf->pages[0]->mapping,
@@ -293,16 +288,6 @@ struct walk_control {
*/
int free;
- /* should we write out the extent buffer? This is used
- * while flushing the log tree to disk during a sync
- */
- int write;
-
- /* should we wait for the extent buffer io to finish? Also used
- * while flushing the log tree to disk for a sync
- */
- int wait;
-
/* pin only walk, we record which extents on disk belong to the
* log trees
*/
@@ -353,40 +338,24 @@ static int process_one_buffer(struct btrfs_root *log,
return ret;
}
- if (wc->pin)
+ if (wc->pin) {
ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
eb->len);
+ if (ret)
+ return ret;
- if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
- if (wc->pin && btrfs_header_level(eb) == 0)
+ if (btrfs_buffer_uptodate(eb, gen, 0) &&
+ btrfs_header_level(eb) == 0)
ret = btrfs_exclude_logged_extents(eb);
- if (wc->write)
- btrfs_write_tree_block(eb);
- if (wc->wait)
- btrfs_wait_tree_block_writeback(eb);
}
return ret;
}
-/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
- *
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
- *
- * If the key is already in the destination tree the existing item is
- * overwritten. If the existing item isn't big enough, it is extended.
- * If it is too large, it is truncated.
- *
- * If the key isn't in the destination yet, a new item is inserted.
- */
-static noinline int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
{
int ret;
u32 item_size;
@@ -400,18 +369,30 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
overwrite_root = 1;
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
- /* look for the key in the destination tree */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
+ /* Our caller must have done a search for the key for us. */
+ ASSERT(path->nodes[0] != NULL);
+
+ /*
+ * And the slot must point to the exact key or the slot where the key
+ * should be at (the first item with a key greater than 'key')
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+ ret = btrfs_comp_cpu_keys(&found_key, key);
+ ASSERT(ret >= 0);
+ } else {
+ ret = 1;
+ }
if (ret == 0) {
char *src_copy;
char *dst_copy;
- u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+ u32 dst_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (dst_size != item_size)
goto insert;
@@ -505,7 +486,7 @@ insert:
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
- found_size = btrfs_item_size_nr(path->nodes[0],
+ found_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
btrfs_truncate_item(path, item_size, 1);
@@ -585,6 +566,36 @@ no_copy:
}
/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ return do_overwrite_item(trans, root, path, eb, slot, key);
+}
+
+/*
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
@@ -761,7 +772,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ins.objectid, ins.offset, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
- key->objectid, offset);
+ key->objectid, offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
@@ -844,17 +855,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
+
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
+ csum_root = btrfs_csum_root(fs_info,
+ sums->bytenr);
if (!ret)
- ret = btrfs_del_csums(trans,
- fs_info->csum_root,
+ ret = btrfs_del_csums(trans, csum_root,
sums->bytenr,
sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans,
- fs_info->csum_root, sums);
+ csum_root,
+ sums);
list_del(&sums->list);
kfree(sums);
}
@@ -884,6 +899,26 @@ out:
return ret;
}
+static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ const char *name,
+ int name_len)
+{
+ int ret;
+
+ ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ if (ret)
+ return ret;
+ /*
+ * Whenever we need to check if a name exists or not, we check the
+ * fs/subvolume tree. So after an unlink we must run delayed items, so
+ * that future checks for a name during log replay see that the name
+ * does not exists anymore.
+ */
+ return btrfs_run_delayed_items(trans);
+}
+
/*
* when cleaning up conflicts between the directory names in the
* subvolume, directory names in the log and directory names in the
@@ -893,11 +928,11 @@ out:
* item
*/
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
+ struct btrfs_root *root = dir->root;
struct inode *inode;
char *name;
int name_len;
@@ -926,12 +961,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
+ ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
name_len);
- if (ret)
- goto out;
- else
- ret = btrfs_run_delayed_items(trans);
out:
kfree(name);
iput(inode);
@@ -1068,7 +1099,7 @@ again:
* otherwise they must be unlinked as a conflict
*/
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
while (ptr < ptr_end) {
victim_ref = (struct btrfs_inode_ref *)ptr;
victim_name_len = btrfs_inode_ref_name_len(leaf,
@@ -1091,14 +1122,11 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root, dir, inode,
+ ret = unlink_inode_for_log_replay(trans, dir, inode,
victim_name, victim_name_len);
kfree(victim_name);
if (ret)
return ret;
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- return ret;
*search_done = 1;
goto again;
}
@@ -1127,7 +1155,7 @@ again:
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
base = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
@@ -1153,6 +1181,7 @@ again:
parent_objectid, victim_name,
victim_name_len);
if (ret < 0) {
+ kfree(victim_name);
return ret;
} else if (!ret) {
ret = -ENOENT;
@@ -1162,14 +1191,11 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root,
+ ret = unlink_inode_for_log_replay(trans,
BTRFS_I(victim_parent),
inode,
victim_name,
victim_name_len);
- if (!ret)
- ret = btrfs_run_delayed_items(
- trans);
}
iput(victim_parent);
kfree(victim_name);
@@ -1192,7 +1218,7 @@ next:
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1204,7 +1230,7 @@ next:
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1289,7 +1315,7 @@ again:
eb = path->nodes[0];
ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
while (ref_ptr < ref_end) {
char *name = NULL;
int namelen;
@@ -1324,7 +1350,7 @@ again:
kfree(name);
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
inode, name, namelen);
kfree(name);
iput(dir);
@@ -1385,10 +1411,11 @@ out:
return ret;
}
-static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+static int add_link(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode, const char *name,
int namelen, u64 ref_index)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_dir_item *dir_item;
struct btrfs_key key;
struct btrfs_path *path;
@@ -1422,8 +1449,8 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = -ENOENT;
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
- name, namelen);
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
+ name, namelen);
if (ret)
goto out;
/*
@@ -1432,10 +1459,6 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
*/
if (other_inode->i_nlink == 0)
inc_nlink(other_inode);
-
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- goto out;
add_link:
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
name, namelen, 0, ref_index);
@@ -1474,7 +1497,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
int ref_struct_size;
ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+ ref_end = ref_ptr + btrfs_item_size(eb, slot);
if (key->type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *r;
@@ -1568,7 +1591,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = btrfs_inode_ref_exists(inode, dir, key->type,
name, namelen);
if (ret > 0) {
- ret = btrfs_unlink_inode(trans, root,
+ ret = unlink_inode_for_log_replay(trans,
BTRFS_I(dir),
BTRFS_I(inode),
name, namelen);
@@ -1584,7 +1607,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
/* insert our name */
- ret = add_link(trans, root, dir, inode, name, namelen,
+ ret = add_link(trans, dir, inode, name, namelen,
ref_index);
if (ret)
goto out;
@@ -1648,7 +1671,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
break;
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
cur_offset = 0;
@@ -1702,7 +1725,7 @@ process_slot:
key.type != BTRFS_INODE_REF_KEY)
break;
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+ ptr_end = ptr + btrfs_item_size(path->nodes[0],
path->slots[0]);
while (ptr < ptr_end) {
struct btrfs_inode_ref *ref;
@@ -1920,6 +1943,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
return ret;
}
+static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *dst_di,
+ const struct btrfs_key *log_key,
+ u8 log_type,
+ bool exists)
+{
+ struct btrfs_key found_key;
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ /* The existing dentry points to the same inode, don't delete it. */
+ if (found_key.objectid == log_key->objectid &&
+ found_key.type == log_key->type &&
+ found_key.offset == log_key->offset &&
+ btrfs_dir_type(path->nodes[0], dst_di) == log_type)
+ return 1;
+
+ /*
+ * Don't drop the conflicting directory entry if the inode for the new
+ * entry doesn't exist.
+ */
+ if (!exists)
+ return 0;
+
+ return drop_one_dir_item(trans, path, dir, dst_di);
+}
+
/*
* take a single entry in a log directory item and replay it into
* the subvolume.
@@ -1945,14 +1996,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
{
char *name;
int name_len;
- struct btrfs_dir_item *dst_di;
- struct btrfs_key found_key;
+ struct btrfs_dir_item *dir_dst_di;
+ struct btrfs_dir_item *index_dst_di;
+ bool dir_dst_matches = false;
+ bool index_dst_matches = false;
struct btrfs_key log_key;
+ struct btrfs_key search_key;
struct inode *dir;
u8 log_type;
bool exists;
int ret;
- bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+ bool update_size = true;
bool name_added = false;
dir = read_one_inode(root, key->objectid);
@@ -1978,76 +2032,53 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
exists = (ret == 0);
ret = 0;
- if (key->type == BTRFS_DIR_ITEM_KEY) {
- dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- name, name_len, 1);
- } else if (key->type == BTRFS_DIR_INDEX_KEY) {
- dst_di = btrfs_lookup_dir_index_item(trans, root, path,
- key->objectid,
- key->offset, name,
- name_len, 1);
- } else {
- /* Corruption */
- ret = -EINVAL;
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+ name, name_len, 1);
+ if (IS_ERR(dir_dst_di)) {
+ ret = PTR_ERR(dir_dst_di);
goto out;
- }
-
- if (IS_ERR(dst_di)) {
- ret = PTR_ERR(dst_di);
- goto out;
- } else if (!dst_di) {
- /* we need a sequence number to insert, so we only
- * do inserts for the BTRFS_DIR_INDEX_KEY types
- */
- if (key->type != BTRFS_DIR_INDEX_KEY)
+ } else if (dir_dst_di) {
+ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+ dir_dst_di, &log_key, log_type,
+ exists);
+ if (ret < 0)
goto out;
- goto insert;
+ dir_dst_matches = (ret == 1);
}
- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
- /* the existing item matches the logged item */
- if (found_key.objectid == log_key.objectid &&
- found_key.type == log_key.type &&
- found_key.offset == log_key.offset &&
- btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
- update_size = false;
+ btrfs_release_path(path);
+
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+ key->objectid, key->offset,
+ name, name_len, 1);
+ if (IS_ERR(index_dst_di)) {
+ ret = PTR_ERR(index_dst_di);
goto out;
+ } else if (index_dst_di) {
+ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+ index_dst_di, &log_key,
+ log_type, exists);
+ if (ret < 0)
+ goto out;
+ index_dst_matches = (ret == 1);
}
- /*
- * don't drop the conflicting directory entry if the inode
- * for the new entry doesn't exist
- */
- if (!exists)
- goto out;
+ btrfs_release_path(path);
- ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
- if (ret)
+ if (dir_dst_matches && index_dst_matches) {
+ ret = 0;
+ update_size = false;
goto out;
-
- if (key->type == BTRFS_DIR_INDEX_KEY)
- goto insert;
-out:
- btrfs_release_path(path);
- if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
}
- kfree(name);
- iput(dir);
- if (!ret && name_added)
- ret = 1;
- return ret;
-insert:
/*
* Check if the inode reference exists in the log for the given name,
* inode and parent inode
*/
- found_key.objectid = log_key.objectid;
- found_key.type = BTRFS_INODE_REF_KEY;
- found_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -2057,10 +2088,10 @@ insert:
goto out;
}
- found_key.objectid = log_key.objectid;
- found_key.type = BTRFS_INODE_EXTREF_KEY;
- found_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
name_len);
if (ret < 0) {
goto out;
@@ -2079,87 +2110,76 @@ insert:
name_added = true;
update_size = false;
ret = 0;
- goto out;
+
+out:
+ if (!ret && update_size) {
+ btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ }
+ kfree(name);
+ iput(dir);
+ if (!ret && name_added)
+ ret = 1;
+ return ret;
}
-/*
- * find all the names in a directory item and reconcile them into
- * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
- * one name in a directory item, but the same code gets used for
- * both directory index types
- */
+/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- int ret = 0;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ int ret;
struct btrfs_dir_item *di;
- int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
- struct btrfs_path *fixup_path = NULL;
-
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- name_len = btrfs_dir_name_len(eb, di);
- ret = replay_one_name(trans, root, path, eb, di, key);
- if (ret < 0)
- break;
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
- /*
- * If this entry refers to a non-directory (directories can not
- * have a link count > 1) and it was added in the transaction
- * that was not committed, make sure we fixup the link count of
- * the inode it the entry points to. Otherwise something like
- * the following would result in a directory pointing to an
- * inode with a wrong link that does not account for this dir
- * entry:
- *
- * mkdir testdir
- * touch testdir/foo
- * touch testdir/bar
- * sync
- *
- * ln testdir/bar testdir/bar_link
- * ln testdir/foo testdir/foo_link
- * xfs_io -c "fsync" testdir/bar
- *
- * <power failure>
- *
- * mount fs, log replay happens
- *
- * File foo would remain with a link count of 1 when it has two
- * entries pointing to it in the directory testdir. This would
- * make it impossible to ever delete the parent directory has
- * it would result in stale dentries that can never be deleted.
- */
- if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
- struct btrfs_key di_key;
+ /* We only log dir index keys, which only contain a single dir item. */
+ ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
- if (!fixup_path) {
- fixup_path = btrfs_alloc_path();
- if (!fixup_path) {
- ret = -ENOMEM;
- break;
- }
- }
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ ret = replay_one_name(trans, root, path, eb, di, key);
+ if (ret < 0)
+ return ret;
- btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- ret = link_to_fixup_dir(trans, root, fixup_path,
- di_key.objectid);
- if (ret)
- break;
- }
- ret = 0;
+ /*
+ * If this entry refers to a non-directory (directories can not have a
+ * link count > 1) and it was added in the transaction that was not
+ * committed, make sure we fixup the link count of the inode the entry
+ * points to. Otherwise something like the following would result in a
+ * directory pointing to an inode with a wrong link that does not account
+ * for this dir entry:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * touch testdir/bar
+ * sync
+ *
+ * ln testdir/bar testdir/bar_link
+ * ln testdir/foo testdir/foo_link
+ * xfs_io -c "fsync" testdir/bar
+ *
+ * <power failure>
+ *
+ * mount fs, log replay happens
+ *
+ * File foo would remain with a link count of 1 when it has two entries
+ * pointing to it in the directory testdir. This would make it impossible
+ * to ever delete the parent directory as it would result in stale
+ * dentries that can never be deleted.
+ */
+ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+ struct btrfs_path *fixup_path;
+ struct btrfs_key di_key;
+
+ fixup_path = btrfs_alloc_path();
+ if (!fixup_path)
+ return -ENOMEM;
+
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+ ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
+ btrfs_free_path(fixup_path);
}
- btrfs_free_path(fixup_path);
+
return ret;
}
@@ -2176,7 +2196,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
*/
static noinline int find_dir_range(struct btrfs_root *root,
struct btrfs_path *path,
- u64 dirid, int key_type,
+ u64 dirid,
u64 *start_ret, u64 *end_ret)
{
struct btrfs_key key;
@@ -2189,7 +2209,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
return 1;
key.objectid = dirid;
- key.type = key_type;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
key.offset = *start_ret;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2203,7 +2223,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
if (ret != 0)
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto next;
}
@@ -2230,7 +2250,7 @@ next:
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto out;
}
@@ -2251,105 +2271,85 @@ out:
* to is unlinked
*/
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
struct inode *dir,
struct btrfs_key *dir_key)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
int ret;
struct extent_buffer *eb;
int slot;
- u32 item_size;
struct btrfs_dir_item *di;
- struct btrfs_dir_item *log_di;
int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
char *name;
- struct inode *inode;
+ struct inode *inode = NULL;
struct btrfs_key location;
-again:
+ /*
+ * Currently we only log dir index keys. Even if we replay a log created
+ * by an older kernel that logged both dir index and dir item keys, all
+ * we need to do is process the dir index keys, we (and our caller) can
+ * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
+ */
+ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
+
eb = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
- goto out;
- }
- read_extent_buffer(eb, name, (unsigned long)(di + 1),
- name_len);
- log_di = NULL;
- if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
- log_di = btrfs_lookup_dir_item(trans, log, log_path,
- dir_key->objectid,
- name, name_len, 0);
- } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
- log_di = btrfs_lookup_dir_index_item(trans, log,
- log_path,
- dir_key->objectid,
- dir_key->offset,
- name, name_len, 0);
- }
- if (!log_di) {
- btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(path);
- btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- kfree(name);
- return -EIO;
- }
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ name_len = btrfs_dir_name_len(eb, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
- ret = link_to_fixup_dir(trans, root,
- path, location.objectid);
- if (ret) {
- kfree(name);
- iput(inode);
- goto out;
- }
+ read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
- inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
- BTRFS_I(inode), name, name_len);
- if (!ret)
- ret = btrfs_run_delayed_items(trans);
- kfree(name);
- iput(inode);
- if (ret)
- goto out;
+ if (log) {
+ struct btrfs_dir_item *log_di;
- /* there might still be more names under this key
- * check and repeat if required
- */
- ret = btrfs_search_slot(NULL, root, dir_key, path,
- 0, 0);
- if (ret == 0)
- goto again;
+ log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+ dir_key->objectid,
+ dir_key->offset,
+ name, name_len, 0);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ goto out;
+ } else if (log_di) {
+ /* The dentry exists in the log, we have nothing to do. */
ret = 0;
goto out;
- } else if (IS_ERR(log_di)) {
- kfree(name);
- return PTR_ERR(log_di);
}
- btrfs_release_path(log_path);
- kfree(name);
+ }
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
+ inode = read_one_inode(root, location.objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
}
- ret = 0;
+
+ ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ if (ret)
+ goto out;
+
+ inc_nlink(inode);
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, name_len);
+ /*
+ * Unlike dir item keys, dir index keys can only have one name (entry) in
+ * them, as there are no key collisions since each key has a unique offset
+ * (an index number), so we're done.
+ */
out:
btrfs_release_path(path);
btrfs_release_path(log_path);
+ kfree(name);
+ iput(inode);
return ret;
}
@@ -2392,7 +2392,7 @@ process_leaf:
}
di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
- total_size = btrfs_item_size_nr(path->nodes[0], i);
+ total_size = btrfs_item_size(path->nodes[0], i);
cur = 0;
while (cur < total_size) {
u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
@@ -2469,7 +2469,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
{
u64 range_start;
u64 range_end;
- int key_type = BTRFS_DIR_LOG_ITEM_KEY;
int ret = 0;
struct btrfs_key dir_key;
struct btrfs_key found_key;
@@ -2477,7 +2476,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct inode *dir;
dir_key.objectid = dirid;
- dir_key.type = BTRFS_DIR_ITEM_KEY;
+ dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
if (!log_path)
return -ENOMEM;
@@ -2491,16 +2490,18 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
btrfs_free_path(log_path);
return 0;
}
-again:
+
range_start = 0;
range_end = 0;
while (1) {
if (del_all)
range_end = (u64)-1;
else {
- ret = find_dir_range(log, path, dirid, key_type,
+ ret = find_dir_range(log, path, dirid,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
break;
}
@@ -2523,13 +2524,15 @@ again:
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (found_key.objectid != dirid ||
- found_key.type != dir_key.type)
- goto next_type;
+ found_key.type != dir_key.type) {
+ ret = 0;
+ goto out;
+ }
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, root, log, path,
+ ret = check_item_in_log(trans, log, path,
log_path, dir,
&found_key);
if (ret)
@@ -2543,15 +2546,7 @@ again:
break;
range_start = range_end + 1;
}
-
-next_type:
ret = 0;
- if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
- key_type = BTRFS_DIR_LOG_INDEX_KEY;
- dir_key.type = BTRFS_DIR_INDEX_KEY;
- btrfs_release_path(path);
- goto again;
- }
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
@@ -2711,12 +2706,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
eb, i, &key);
if (ret)
break;
- } else if (key.type == BTRFS_DIR_ITEM_KEY) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
- if (ret)
- break;
}
+ /*
+ * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
+ * BTRFS_DIR_INDEX_KEY items which we use to derive the
+ * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
+ * older kernel with such keys, ignore them.
+ */
}
btrfs_free_path(path);
return ret;
@@ -2877,6 +2873,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
path->nodes[*level]->len);
if (ret)
return ret;
+ btrfs_redirty_list_add(trans->transaction,
+ next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
@@ -2957,6 +2955,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
next->start, next->len);
if (ret)
goto out;
+ btrfs_redirty_list_add(trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
@@ -3037,9 +3036,6 @@ static void wait_for_writer(struct btrfs_root *root)
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- if (!ctx)
- return;
-
mutex_lock(&root->log_mutex);
list_del_init(&ctx->list);
mutex_unlock(&root->log_mutex);
@@ -3192,6 +3188,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_root->log_mutex);
+ blk_finish_plug(&plug);
goto out;
}
}
@@ -3328,7 +3325,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* writing the super here would result in transid mismatches. If there
* is an error here just bail.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EIO;
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
@@ -3399,6 +3396,29 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
if (log->node) {
ret = walk_log_tree(trans, log, &wc);
if (ret) {
+ /*
+ * We weren't able to traverse the entire log tree, the
+ * typical scenario is getting an -EIO when reading an
+ * extent buffer of the tree, due to a previous writeback
+ * failure of it.
+ */
+ set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ &log->fs_info->fs_state);
+
+ /*
+ * Some extent buffers of the log tree may still be dirty
+ * and not yet written back to storage, because we may
+ * have updates to a log tree without syncing a log tree,
+ * such as during rename and link operations. So flush
+ * them out and wait for their writeback to complete, so
+ * that we properly cleanup their state and pages.
+ */
+ btrfs_write_marked_extents(log->fs_info,
+ &log->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ btrfs_wait_tree_log_extents(log,
+ EXTENT_DIRTY | EXTENT_NEW);
+
if (trans)
btrfs_abort_transaction(trans, ret);
else
@@ -3410,8 +3430,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
extent_io_tree_release(&log->log_csum_range);
- if (trans && log->node)
- btrfs_redirty_list_add(trans->transaction, log->node);
btrfs_put_root(log);
}
@@ -3441,32 +3459,156 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
}
/*
- * Check if an inode was logged in the current transaction. This may often
- * return some false positives, because logged_trans is an in memory only field,
- * not persisted anywhere. This is meant to be used in contexts where a false
- * positive has no functional consequences.
+ * Check if an inode was logged in the current transaction. This correctly deals
+ * with the case where the inode was logged but has a logged_trans of 0, which
+ * happens if the inode is evicted and loaded again, as logged_trans is an in
+ * memory only field (not persisted).
+ *
+ * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
+ * and < 0 on error.
*/
-static bool inode_logged(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+static int inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path_in)
{
+ struct btrfs_path *path = path_in;
+ struct btrfs_key key;
+ int ret;
+
if (inode->logged_trans == trans->transid)
- return true;
+ return 1;
/*
- * The inode's logged_trans is always 0 when we load it (because it is
- * not persisted in the inode item or elsewhere). So if it is 0, the
- * inode was last modified in the current transaction then the inode may
- * have been logged before in the current transaction, then evicted and
- * loaded again in the current transaction - or may have never been logged
- * in the current transaction, but since we can not be sure, we have to
- * assume it was, otherwise our callers can leave an inconsistent log.
+ * If logged_trans is not 0, then we know the inode was not logged in this
+ * transaction, so we can return 0 right away.
*/
- if (inode->logged_trans == 0 &&
- inode->last_trans == trans->transid &&
- !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
- return true;
+ if (inode->logged_trans > 0)
+ return 0;
+
+ /*
+ * If no log tree was created for this root in this transaction, then
+ * the inode can not have been logged in this transaction. In that case
+ * set logged_trans to anything greater than 0 and less than the current
+ * transaction's ID, to avoid the search below in a future call in case
+ * a log tree gets created after this.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
- return false;
+ /*
+ * We have a log tree and the inode's logged_trans is 0. We can't tell
+ * for sure if the inode was logged before in this transaction by looking
+ * only at logged_trans. We could be pessimistic and assume it was, but
+ * that can lead to unnecessarily logging an inode during rename and link
+ * operations, and then further updating the log in followup rename and
+ * link operations, specially if it's a directory, which adds latency
+ * visible to applications doing a series of rename or link operations.
+ *
+ * A logged_trans of 0 here can mean several things:
+ *
+ * 1) The inode was never logged since the filesystem was mounted, and may
+ * or may not have been evicted and loaded again;
+ *
+ * 2) The inode was logged in a previous transaction, then evicted and
+ * then loaded again;
+ *
+ * 3) The inode was logged in the current transaction, then evicted and
+ * then loaded again.
+ *
+ * For cases 1) and 2) we don't want to return true, but we need to detect
+ * case 3) and return true. So we do a search in the log root for the inode
+ * item.
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+
+ if (path_in)
+ btrfs_release_path(path);
+ else
+ btrfs_free_path(path);
+
+ /*
+ * Logging an inode always results in logging its inode item. So if we
+ * did not find the item we know the inode was not logged for sure.
+ */
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /*
+ * Set logged_trans to a value greater than 0 and less than the
+ * current transaction to avoid doing the search in future calls.
+ */
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
+
+ /*
+ * The inode was previously logged and then evicted, set logged_trans to
+ * the current transaction's ID, to avoid future tree searches as long as
+ * the inode is not evicted again.
+ */
+ inode->logged_trans = trans->transid;
+
+ /*
+ * If it's a directory, then we must set last_dir_index_offset to the
+ * maximum possible value, so that the next attempt to log the inode does
+ * not skip checking if dir index keys found in modified subvolume tree
+ * leaves have been logged before, otherwise it would result in attempts
+ * to insert duplicate dir index keys in the log tree. This must be done
+ * because last_dir_index_offset is an in-memory only field, not persisted
+ * in the inode item or any other on-disk structure, so its value is lost
+ * once the inode is evicted.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->last_dir_index_offset = (u64)-1;
+
+ return 1;
+}
+
+/*
+ * Delete a directory entry from the log if it exists.
+ *
+ * Returns < 0 on error
+ * 1 if the entry does not exist
+ * 0 if the entry existed and was successfully deleted
+ */
+static int del_logged_dentry(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dir_ino,
+ const char *name, int name_len,
+ u64 index)
+{
+ struct btrfs_dir_item *di;
+
+ /*
+ * We only log dir index items of a directory, so we don't need to look
+ * for dir item keys.
+ */
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+ index, name, name_len, -1);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+ else if (!di)
+ return 1;
+
+ /*
+ * We do not need to update the size field of the directory's
+ * inode item because on log replay we update the field to reflect
+ * all existing entries in the directory (see overwrite_item()).
+ */
+ return btrfs_delete_one_dir_name(trans, log, path, di);
}
/*
@@ -3490,113 +3632,74 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
* This optimizations allows us to avoid relogging the entire inode
* or the entire directory.
*/
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index)
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index)
{
- struct btrfs_root *log;
- struct btrfs_dir_item *di;
struct btrfs_path *path;
int ret;
- int err = 0;
- u64 dir_ino = btrfs_ino(dir);
- if (!inode_logged(trans, dir))
- return 0;
+ ret = inode_logged(trans, dir, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
mutex_lock(&dir->log_mutex);
- log = root->log_root;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out_unlock;
}
- di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
- name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- if (ret) {
- err = ret;
- goto fail;
- }
- }
- btrfs_release_path(path);
- di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
- index, name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- if (ret) {
- err = ret;
- goto fail;
- }
- }
-
- /*
- * We do not need to update the size field of the directory's inode item
- * because on log replay we update the field to reflect all existing
- * entries in the directory (see overwrite_item()).
- */
-fail:
+ ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
+ name, name_len, index);
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (err == -ENOSPC) {
+ if (ret < 0)
btrfs_set_log_full_commit(trans);
- err = 0;
- } else if (err < 0) {
- btrfs_abort_transaction(trans, err);
- }
-
btrfs_end_log_trans(root);
-
- return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid)
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
u64 index;
int ret;
- if (!inode_logged(trans, inode))
- return 0;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
log = root->log_root;
mutex_lock(&inode->log_mutex);
ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&inode->log_mutex);
- if (ret == -ENOSPC) {
+ if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
-
- return ret;
}
/*
@@ -3607,7 +3710,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- int key_type, u64 dirid,
+ u64 dirid,
u64 first_offset, u64 last_offset)
{
int ret;
@@ -3616,49 +3719,276 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
key.objectid = dirid;
key.offset = first_offset;
- if (key_type == BTRFS_DIR_ITEM_KEY)
- key.type = BTRFS_DIR_LOG_ITEM_KEY;
- else
- key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- if (ret)
+ /*
+ * -EEXIST is fine and can happen sporadically when we are logging a
+ * directory and have concurrent insertions in the subvolume's tree for
+ * items from other inodes and that result in pushing off some dir items
+ * from one leaf to another in order to accommodate for the new items.
+ * This results in logging the same dir index range key.
+ */
+ if (ret && ret != -EEXIST)
return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
+ if (ret == -EEXIST) {
+ const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ /*
+ * btrfs_del_dir_entries_in_log() might have been called during
+ * an unlink between the initial insertion of this key and the
+ * current update, or we might be logging a single entry deletion
+ * during a rename, so set the new last_offset to the max value.
+ */
+ last_offset = max(last_offset, curr_end);
+ }
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
return 0;
}
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct extent_buffer *src,
+ struct btrfs_path *dst_path,
+ int start_slot,
+ int count)
+{
+ char *ins_data = NULL;
+ struct btrfs_item_batch batch;
+ struct extent_buffer *dst;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+ int i;
+
+ ASSERT(count > 0);
+ batch.nr = count;
+
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+ item_size = btrfs_item_size(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+
+ ins_data = kmalloc(count * sizeof(u32) +
+ count * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+
+ for (i = 0; i < count; i++) {
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+ ins_sizes[i] = btrfs_item_size(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst = dst_path->nodes[0];
+ /*
+ * Copy all the items in bulk, in a single copy operation. Item data is
+ * organized such that it's placed at the end of a leaf and from right
+ * to left. For example, the data for the second item ends at an offset
+ * that matches the offset where the data for the first item starts, the
+ * data for the third item ends at an offset that matches the offset
+ * where the data of the second item starts, and so on.
+ * Therefore our source and destination start offsets for copy match the
+ * offsets of the last items (highest slots).
+ */
+ dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+ src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+ copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+ btrfs_release_path(dst_path);
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx,
+ u64 *last_old_dentry_offset)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct extent_buffer *src = path->nodes[0];
+ const int nritems = btrfs_header_nritems(src);
+ const u64 ino = btrfs_ino(inode);
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int i;
+
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ int ret;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+ if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
+ last_found = true;
+ break;
+ }
+
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ ctx->last_dir_item_offset = key.offset;
+
+ /*
+ * Skip ranges of items that consist only of dir item keys created
+ * in past transactions. However if we find a gap, we must log a
+ * dir index range item for that gap, so that index keys in that
+ * gap are deleted during log replay.
+ */
+ if (btrfs_dir_transid(src, di) < trans->transid) {
+ if (key.offset > *last_old_dentry_offset + 1) {
+ ret = insert_dir_log_key(trans, log, dst_path,
+ ino, *last_old_dentry_offset + 1,
+ key.offset - 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ *last_old_dentry_offset = key.offset;
+ continue;
+ }
+ /*
+ * We must make sure that when we log a directory entry, the
+ * corresponding inode, after log replay, has a matching link
+ * count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our file inode
+ * would have a link count of 1, but we get two directory entries
+ * pointing to the same inode. After removing one of the names,
+ * it would not be possible to remove the other name, which
+ * resulted always in stale file handle errors, and would not be
+ * possible to rmdir the parent directory, since its i_size could
+ * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ if (!ctx->log_new_dentries) {
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(src, di, &di_key);
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+
+ if (!ctx->logged_before)
+ goto add_to_batch;
+
+ /*
+ * If we were logged before and have logged dir items, we can skip
+ * checking if any item with a key offset larger than the last one
+ * we logged is in the log tree, saving time and avoiding adding
+ * contention on the log tree. We can only rely on the value of
+ * last_dir_index_offset when we know for sure that the inode was
+ * previously logged in the current transaction.
+ */
+ if (key.offset > inode->last_dir_index_offset)
+ goto add_to_batch;
+ /*
+ * Check if the key was already logged before. If not we can add
+ * it to a batch for bulk insertion.
+ */
+ ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ btrfs_release_path(dst_path);
+ goto add_to_batch;
+ }
+
+ /*
+ * Item exists in the log. Overwrite the item in the log if it
+ * has different content or do nothing if it has exactly the same
+ * content. And then flush the current batch if any - do it after
+ * overwriting the current item, or we would deadlock otherwise,
+ * since we are holding a path for the existing item.
+ */
+ ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
+ if (ret < 0)
+ return ret;
+
+ if (batch_size > 0) {
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ batch_size = 0;
+ }
+ continue;
+add_to_batch:
+ if (batch_size == 0)
+ batch_start = i;
+ batch_size++;
+ }
+
+ if (batch_size > 0) {
+ int ret;
+
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ }
+
+ return last_found ? 1 : 0;
+}
+
/*
* log all the items included in the current transaction for a given
* directory. This also creates the range items in the log tree required
* to replay anything deleted before the fsync
*/
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path, int key_type,
+ struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
+ struct btrfs_root *root = inode->root;
struct btrfs_root *log = root->log_root;
- struct extent_buffer *src;
int err = 0;
int ret;
- int i;
- int nritems;
- u64 first_offset = min_offset;
+ u64 last_old_dentry_offset = min_offset - 1;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
- log = root->log_root;
-
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = min_offset;
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
@@ -3667,9 +3997,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* we didn't find anything from this transaction, see if there
* is anything at all
*/
- if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+ if (ret != 0 || min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = (u64)-1;
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -3677,7 +4008,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
return ret;
}
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
/* if ret == 0 there are items for this type,
* create a range to tell us the last key of this type.
@@ -3686,29 +4017,31 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
*/
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp,
path->slots[0]);
- if (key_type == tmp.type)
- first_offset = max(min_offset, tmp.offset) + 1;
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
}
goto done;
}
/* go backward to find any previous key */
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (key_type == tmp.type) {
- first_offset = tmp.offset;
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret) {
- err = ret;
- goto done;
- }
- }
+ /*
+ * The dir index key before the first one we found that needs to
+ * be logged might be in a previous leaf, and there might be a
+ * gap between these keys, meaning that we had deletions that
+ * happened. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
+ * previous key's offset plus 1, so that those deletes are replayed.
+ */
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
}
btrfs_release_path(path);
@@ -3730,62 +4063,14 @@ search:
* from our directory
*/
while (1) {
- struct btrfs_key tmp;
- src = path->nodes[0];
- nritems = btrfs_header_nritems(src);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
-
- btrfs_item_key_to_cpu(src, &min_key, i);
-
- if (min_key.objectid != ino || min_key.type != key_type)
- goto done;
-
- if (need_resched()) {
- btrfs_release_path(path);
- cond_resched();
- goto search;
- }
-
- ret = overwrite_item(trans, log, dst_path, src, i,
- &min_key);
- if (ret) {
+ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
+ &last_old_dentry_offset);
+ if (ret != 0) {
+ if (ret < 0)
err = ret;
- goto done;
- }
-
- /*
- * We must make sure that when we log a directory entry,
- * the corresponding inode, after log replay, has a
- * matching link count. For example:
- *
- * touch foo
- * mkdir mydir
- * sync
- * ln foo mydir/bar
- * xfs_io -c "fsync" mydir
- * <crash>
- * <mount fs and log replay>
- *
- * Would result in a fsync log that when replayed, our
- * file inode would have a link count of 1, but we get
- * two directory entries pointing to the same inode.
- * After removing one of the names, it would not be
- * possible to remove the other name, which resulted
- * always in stale file handle errors, and would not
- * be possible to rmdir the parent directory, since
- * its i_size could never decrement to the value
- * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
- */
- di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(src, di, &tmp);
- if (ctx &&
- (btrfs_dir_transid(src, di) == trans->transid ||
- btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
- tmp.type != BTRFS_ROOT_ITEM_KEY)
- ctx->log_new_dentries = true;
+ goto done;
}
- path->slots[0] = nritems;
+ path->slots[0] = btrfs_header_nritems(path->nodes[0]);
/*
* look ahead to the next item and see if it is also
@@ -3799,21 +4084,29 @@ search:
err = ret;
goto done;
}
- btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != ino || tmp.type != key_type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+ if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret)
- err = ret;
- else
- last_offset = tmp.offset;
+ /*
+ * The next leaf was not changed in the current transaction
+ * and has at least one dir index key.
+ * We check for the next key because there might have been
+ * one or more deletions between the last key we logged and
+ * that next key. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
+ * offset minus 1, so that those deletes are replayed.
+ */
+ last_offset = min_key.offset - 1;
goto done;
}
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
}
done:
btrfs_release_path(path);
@@ -3822,13 +4115,21 @@ done:
if (err == 0) {
*last_offset_ret = last_offset;
/*
- * insert the log range keys to indicate where the log
- * is valid
+ * In case the leaf was changed in the current transaction but
+ * all its dir items are from a past transaction, the last item
+ * in the leaf is a dir item and there's no gap between that last
+ * dir item and the first one on the next leaf (which did not
+ * change in the current transaction), then we don't need to log
+ * a range, because last_old_dentry_offset is equal to last_offset.
*/
- ret = insert_dir_log_key(trans, log, path, key_type,
- ino, first_offset, last_offset);
- if (ret)
- err = ret;
+ ASSERT(last_old_dentry_offset <= last_offset);
+ if (last_old_dentry_offset < last_offset) {
+ ret = insert_dir_log_key(trans, log, path, ino,
+ last_old_dentry_offset + 1,
+ last_offset);
+ if (ret)
+ err = ret;
+ }
}
return err;
}
@@ -3846,7 +4147,7 @@ done:
* key logged by this transaction.
*/
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx)
@@ -3854,13 +4155,13 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
u64 min_key;
u64 max_key;
int ret;
- int key_type = BTRFS_DIR_ITEM_KEY;
-again:
- min_key = 0;
+ min_key = BTRFS_DIR_START_INDEX;
max_key = 0;
+ ctx->last_dir_item_offset = inode->last_dir_index_offset;
+
while (1) {
- ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
+ ret = log_dir_items(trans, inode, path, dst_path,
ctx, min_key, &max_key);
if (ret)
return ret;
@@ -3869,10 +4170,8 @@ again:
min_key = max_key + 1;
}
- if (key_type == BTRFS_DIR_ITEM_KEY) {
- key_type = BTRFS_DIR_INDEX_KEY;
- goto again;
- }
+ inode->last_dir_index_offset = ctx->last_dir_item_offset;
+
return 0;
}
@@ -3882,17 +4181,18 @@ again:
* This cannot be run for file data extents because it does not
* free the extents they point to.
*/
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
+static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 objectid, int max_key_type)
+ struct btrfs_inode *inode,
+ int max_key_type)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
int start_slot;
- key.objectid = objectid;
+ key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
@@ -3909,7 +4209,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- if (found_key.objectid != objectid)
+ if (found_key.objectid != key.objectid)
break;
found_key.offset = 0;
@@ -3934,6 +4234,21 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
return ret;
}
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *inode,
+ u64 new_size, u32 min_type)
+{
+ struct btrfs_truncate_control control = {
+ .new_size = new_size,
+ .ino = btrfs_ino(inode),
+ .min_type = min_type,
+ .skip_ref_updates = true,
+ };
+
+ return btrfs_truncate_inode_items(trans, log_root, &control);
+}
+
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
@@ -4096,22 +4411,18 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- unsigned long src_offset;
- unsigned long dst_offset;
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
- struct btrfs_inode_item *inode_item;
struct extent_buffer *src = src_path->nodes[0];
- int ret;
+ int ret = 0;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
+ struct btrfs_item_batch batch;
char *ins_data;
int i;
- struct list_head ordered_sums;
- int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
-
- INIT_LIST_HEAD(&ordered_sums);
+ int dst_index;
+ const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
@@ -4120,27 +4431,155 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+ batch.nr = 0;
+ dst_index = 0;
for (i = 0; i < nr; i++) {
- ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
- btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
- }
- ret = btrfs_insert_empty_items(trans, log, dst_path,
- ins_keys, ins_sizes, nr);
- if (ret) {
- kfree(ins_data);
- return ret;
+ const int src_slot = start_slot + i;
+ struct btrfs_root *csum_root;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_sum *sums_next;
+ LIST_HEAD(ordered_sums);
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ u64 extent_num_bytes;
+ bool is_old_extent;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
+
+ if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
+ goto add_to_batch;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ is_old_extent = (btrfs_file_extent_generation(src, extent) <
+ trans->transid);
+
+ /*
+ * Don't copy extents from past generations. That would make us
+ * log a lot more metadata for common cases like doing only a
+ * few random writes into a file and then fsync it for the first
+ * time or after the full sync flag is set on the inode. We can
+ * get leaves full of extent items, most of which are from past
+ * generations, so we can skip them - as long as the inode has
+ * not been the target of a reflink operation in this transaction,
+ * as in that case it might have had file extent items with old
+ * generations copied into it. We also must always log prealloc
+ * extents that start at or beyond eof, otherwise we would lose
+ * them on log replay.
+ */
+ if (is_old_extent &&
+ ins_keys[dst_index].offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+ if (skip_csum)
+ goto add_to_batch;
+
+ /* Only regular extents have checksums. */
+ if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
+ goto add_to_batch;
+
+ /*
+ * If it's an extent created in a past transaction, then its
+ * checksums are already accessible from the committed csum tree,
+ * no need to log them.
+ */
+ if (is_old_extent)
+ goto add_to_batch;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
+ /* If it's an explicit hole, there are no checksums. */
+ if (disk_bytenr == 0)
+ goto add_to_batch;
+
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
+
+ if (btrfs_file_extent_compression(src, extent)) {
+ extent_offset = 0;
+ extent_num_bytes = disk_num_bytes;
+ } else {
+ extent_offset = btrfs_file_extent_offset(src, extent);
+ extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
+ }
+
+ csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
+ disk_bytenr += extent_offset;
+ ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
+ disk_bytenr + extent_num_bytes - 1,
+ &ordered_sums, 0);
+ if (ret)
+ goto out;
+
+ list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
+ if (!ret)
+ ret = log_csums(trans, inode, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ goto out;
+
+add_to_batch:
+ ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
+ batch.total_data_size += ins_sizes[dst_index];
+ batch.nr++;
+ dst_index++;
}
- for (i = 0; i < nr; i++, dst_path->slots[0]++) {
- dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
- dst_path->slots[0]);
+ /*
+ * We have a leaf full of old extent items that don't need to be logged,
+ * so we don't need to do anything.
+ */
+ if (batch.nr == 0)
+ goto out;
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst_index = 0;
+ for (i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ const int dst_slot = dst_path->slots[0] + dst_index;
+ struct btrfs_key key;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+
+ /*
+ * We're done, all the remaining items in the source leaf
+ * correspond to old file extent items.
+ */
+ if (dst_index >= batch.nr)
+ break;
+
+ btrfs_item_key_to_cpu(src, &key, src_slot);
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ goto copy_item;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ /* See the comment in the previous loop, same logic. */
+ if (btrfs_file_extent_generation(src, extent) < trans->transid &&
+ key.offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
- src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+copy_item:
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
+ src_offset = btrfs_item_ptr_offset(src, src_slot);
- if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
- inode_item = btrfs_item_ptr(dst_path->nodes[0],
- dst_path->slots[0],
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *inode_item;
+
+ inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
&inode->vfs_inode,
@@ -4148,70 +4587,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
- src_offset, ins_sizes[i]);
+ src_offset, ins_sizes[dst_index]);
}
- /* take a reference on file data extents so that truncates
- * or deletes of this inode don't have to relog the inode
- * again
- */
- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
- !skip_csum) {
- int found_type;
- extent = btrfs_item_ptr(src, start_slot + i,
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_generation(src, extent) < trans->transid)
- continue;
-
- found_type = btrfs_file_extent_type(src, extent);
- if (found_type == BTRFS_FILE_EXTENT_REG) {
- u64 ds, dl, cs, cl;
- ds = btrfs_file_extent_disk_bytenr(src,
- extent);
- /* ds == 0 is a hole */
- if (ds == 0)
- continue;
-
- dl = btrfs_file_extent_disk_num_bytes(src,
- extent);
- cs = btrfs_file_extent_offset(src, extent);
- cl = btrfs_file_extent_num_bytes(src,
- extent);
- if (btrfs_file_extent_compression(src,
- extent)) {
- cs = 0;
- cl = dl;
- }
-
- ret = btrfs_lookup_csums_range(
- fs_info->csum_root,
- ds + cs, ds + cs + cl - 1,
- &ordered_sums, 0);
- if (ret)
- break;
- }
- }
+ dst_index++;
}
btrfs_mark_buffer_dirty(dst_path->nodes[0]);
btrfs_release_path(dst_path);
+out:
kfree(ins_data);
- /*
- * we have to do this after the loop above to avoid changing the
- * log tree while trying to change the log tree.
- */
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- if (!ret)
- ret = log_csums(trans, inode, log, sums);
- list_del(&sums->list);
- kfree(sums);
- }
-
return ret;
}
@@ -4237,6 +4623,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *csum_root;
u64 csum_offset;
u64 csum_len;
u64 mod_start = em->mod_start;
@@ -4317,7 +4704,8 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
/* block start is already adjusted for the file extent offset. */
- ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
+ csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
+ ret = btrfs_lookup_csums_range(csum_root,
em->block_start + csum_offset,
em->block_start + csum_offset +
csum_len - 1, &ordered_sums, 0);
@@ -4338,33 +4726,64 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
static int log_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_root *root,
+ struct btrfs_inode *inode,
const struct extent_map *em,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
struct btrfs_drop_extents_args drop_args = { 0 };
- struct btrfs_root *log = root->log_root;
- struct btrfs_file_extent_item *fi;
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
- struct btrfs_map_token token;
struct btrfs_key key;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
+ btrfs_set_stack_file_extent_generation(&fi, trans->transid);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
+ else
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
+ extent_offset);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ }
+
+ btrfs_set_stack_file_extent_offset(&fi, extent_offset);
+ btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
+ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
+ btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
- drop_args.path = path;
- drop_args.start = em->start;
- drop_args.end = em->start + em->len;
- drop_args.replace_extent = true;
- drop_args.extent_item_size = sizeof(*fi);
- ret = btrfs_drop_extents(trans, log, inode, &drop_args);
- if (ret)
- return ret;
+ /*
+ * If this is the first time we are logging the inode in the current
+ * transaction, we can avoid btrfs_drop_extents(), which is expensive
+ * because it does a deletion search, which always acquires write locks
+ * for extent buffers at levels 2, 1 and 0. This not only wastes time
+ * but also adds significant contention in a log tree, since log trees
+ * are small, with a root at level 2 or 3 at most, due to their short
+ * life span.
+ */
+ if (ctx->logged_before) {
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+ }
if (!drop_args.extent_inserted) {
key.objectid = btrfs_ino(inode);
@@ -4372,44 +4791,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
key.offset = em->start;
ret = btrfs_insert_empty_item(trans, log, path, &key,
- sizeof(*fi));
+ sizeof(fi));
if (ret)
return ret;
}
leaf = path->nodes[0];
- btrfs_init_map_token(&token, leaf);
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- btrfs_set_token_file_extent_type(&token, fi,
- BTRFS_FILE_EXTENT_PREALLOC);
- else
- btrfs_set_token_file_extent_type(&token, fi,
- BTRFS_FILE_EXTENT_REG);
-
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
- em->block_start);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
- em->block_start -
- extent_offset);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
- } else {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
- }
-
- btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
- btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
- btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
- btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
- btrfs_set_token_file_extent_encryption(&token, fi, 0);
- btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
+ write_extent_buffer(leaf, &fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(fi));
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -4419,7 +4808,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
@@ -4522,13 +4911,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
*/
- do {
- ret = btrfs_truncate_inode_items(trans,
- root->log_root,
- inode, truncate_offset,
- BTRFS_EXTENT_DATA_KEY,
- NULL);
- } while (ret == -EAGAIN);
+ ret = truncate_inode_items(trans, root->log_root, inode,
+ truncate_offset,
+ BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
@@ -4555,7 +4940,6 @@ out:
}
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
@@ -4620,7 +5004,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, ctx);
+ ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -4628,7 +5012,6 @@ process:
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
if (ret)
@@ -4709,11 +5092,11 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
* with a journal, ext3/4, xfs, f2fs, etc).
*/
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path)
{
+ struct btrfs_root *root = inode->root;
int ret;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -4787,10 +5170,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
* truncate operation that changes the inode's size.
*/
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -4932,7 +5315,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
struct btrfs_path *search_path;
char *name = NULL;
u32 name_len = 0;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 item_size = btrfs_item_size(eb, slot);
u32 cur_offset = 0;
unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
@@ -5067,7 +5450,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
- ret = btrfs_log_inode(trans, root,
+ ret = btrfs_log_inode(trans,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
ctx);
@@ -5127,8 +5510,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, ctx);
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5204,6 +5586,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
@@ -5224,13 +5607,21 @@ again:
if (min_key->type > max_key->type)
break;
- if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
-
- if ((min_key->type == BTRFS_INODE_REF_KEY ||
- min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5239,7 +5630,7 @@ again:
&other_ino, &other_parent);
if (ret < 0) {
return ret;
- } else if (ret > 0 && ctx &&
+ } else if (ret > 0 &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
if (ins_nr > 0) {
ins_nr++;
@@ -5261,10 +5652,8 @@ again:
btrfs_release_path(path);
goto next_key;
}
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
@@ -5316,10 +5705,29 @@ next_key:
} else {
break;
}
+
+ /*
+ * We may process many leaves full of items for our inode, so
+ * avoid monopolizing a cpu for too long by rescheduling while
+ * not holding locks on any tree.
+ */
+ cond_resched();
}
- if (ins_nr)
+ if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ }
return ret;
}
@@ -5339,7 +5747,7 @@ next_key:
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -5347,9 +5755,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
- struct btrfs_root *log = root->log_root;
- int err = 0;
- int ret = 0;
+ struct btrfs_root *log = inode->root->log_root;
+ int ret;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
@@ -5358,6 +5765,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool xattrs_logged = false;
bool recursive_logging = false;
bool inode_item_dropped = true;
+ const bool orig_logged_before = ctx->logged_before;
path = btrfs_alloc_path();
if (!path)
@@ -5389,22 +5797,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* Only run delayed items if we are a directory. We want to make sure
* all directory indexes hit the fs/subvolume tree so we can find them
* and figure out which index ranges have to be logged.
- *
- * Otherwise commit the delayed inode only if the full sync flag is set,
- * as we want to make sure an up to date version is in the subvolume
- * tree so copy_inode_items_to_log() / copy_items() can find it and copy
- * it to the log tree. For a non full sync, we always log the inode item
- * based on the in-memory struct btrfs_inode which is always up to date.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode))
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
ret = btrfs_commit_inode_delayed_items(trans, inode);
- else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- ret = btrfs_commit_inode_delayed_inode(inode);
-
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
+ if (ret)
+ goto out;
}
if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
@@ -5419,6 +5816,29 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because we create the
+ * inline extent when the symlink is created (we never have delalloc for
+ * symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
+ * Before logging the inode item, cache the value returned by
+ * inode_logged(), because after that we have the need to figure out if
+ * the inode was previously logged in this transaction.
+ */
+ ret = inode_logged(trans, inode, path);
+ if (ret < 0)
+ goto out_unlock;
+ ctx->logged_before = (ret == 1);
+ ret = 0;
+
+ /*
* This is for cases where logging a directory could result in losing a
* file after replaying the log. For example, if we move a file from a
* directory A to a directory B, then fsync directory A, we have no way
@@ -5429,7 +5849,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
inode_only == LOG_INODE_ALL &&
inode->last_unlink_trans >= trans->transid) {
btrfs_set_log_full_commit(trans);
- err = 1;
+ ret = 1;
goto out_unlock;
}
@@ -5443,9 +5863,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key_type);
} else {
- if (inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -5459,27 +5881,25 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* (zeroes), as if an expanding truncate happened,
* instead of getting a file of 4Kb only.
*/
- err = logged_inode_size(log, inode, path, &logged_isize);
- if (err)
+ ret = logged_inode_size(log, inode, path, &logged_isize);
+ if (ret)
goto out_unlock;
}
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path,
+ inode, max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
- while(1) {
- ret = btrfs_truncate_inode_items(trans,
- log, inode, 0, 0, NULL);
- if (ret != -EAGAIN)
- break;
- }
+ if (ctx->logged_before)
+ ret = truncate_inode_items(trans, log,
+ inode, 0, 0);
}
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags) ||
@@ -5487,8 +5907,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
@@ -5497,37 +5918,35 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
}
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
- }
- err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
recursive_logging, inode_only, ctx,
&need_log_inode_item);
- if (err)
+ if (ret)
goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
- if (err)
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, root, inode, path);
- if (err)
+ ret = btrfs_log_holes(trans, inode, path);
+ if (ret)
goto out_unlock;
}
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (need_log_inode_item) {
- err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
- if (err)
+ ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
+ if (ret)
goto out_unlock;
/*
* If we are doing a fast fsync and the inode was logged before
@@ -5538,20 +5957,16 @@ log_extents:
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- err = btrfs_log_all_xattrs(trans, root, inode, path,
- dst_path);
- if (err)
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
goto out_unlock;
btrfs_release_path(path);
}
}
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx);
- if (ret) {
- err = ret;
+ ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
+ if (ret)
goto out_unlock;
- }
} else if (inode_only == LOG_INODE_ALL) {
struct extent_map *em, *n;
@@ -5562,62 +5977,65 @@ log_extents:
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path,
- ctx);
- if (ret) {
- err = ret;
+ ret = log_directory_changes(trans, inode, path, dst_path, ctx);
+ if (ret)
goto out_unlock;
- }
}
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
/*
- * If we are logging that an ancestor inode exists as part of logging a
- * new name from a link or rename operation, don't mark the inode as
- * logged - otherwise if an explicit fsync is made against an ancestor,
- * the fsync considers the inode in the log and doesn't sync the log,
- * resulting in the ancestor missing after a power failure unless the
- * log was synced as part of an fsync against any other unrelated inode.
- * So keep it simple for this case and just don't flag the ancestors as
- * logged.
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for three reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
+ *
+ * 3) If we are logging that an ancestor inode exists as part of
+ * logging a new name from a link or rename operation, don't update
+ * its last_log_commit - otherwise if an explicit fsync is made
+ * against an ancestor, the fsync considers the inode in the log
+ * and doesn't sync the log, resulting in the ancestor missing after
+ * a power failure unless the log was synced as part of an fsync
+ * against any other unrelated inode.
*/
- if (!ctx ||
- !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
- &inode->vfs_inode != ctx->inode)) {
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- /*
- * Don't update last_log_commit if we logged that an inode exists.
- * We do this for two reasons:
- *
- * 1) We might have had buffered writes to this inode that were
- * flushed and had their ordered extents completed in this
- * transaction, but we did not previously log the inode with
- * LOG_INODE_ALL. Later the inode was evicted and after that
- * it was loaded again and this LOG_INODE_EXISTS log operation
- * happened. We must make sure that if an explicit fsync against
- * the inode is performed later, it logs the new extents, an
- * updated inode item, etc, and syncs the log. The same logic
- * applies to direct IO writes instead of buffered writes.
- *
- * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
- * is logged with an i_size of 0 or whatever value was logged
- * before. If later the i_size of the inode is increased by a
- * truncate operation, the log is synced through an fsync of
- * some other inode and then finally an explicit fsync against
- * this inode is made, we must make sure this fsync logs the
- * inode with the new i_size, the hole between old i_size and
- * the new i_size, and syncs the log.
- */
- if (inode_only != LOG_INODE_EXISTS)
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
- }
+ if (inode_only != LOG_INODE_EXISTS)
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+
+ /*
+ * Reset the last_reflink_trans so that the next fsync does not need to
+ * go through the slower path when logging extents and their checksums.
+ */
+ if (inode_only == LOG_INODE_ALL)
+ inode->last_reflink_trans = 0;
+
out_unlock:
mutex_unlock(&inode->log_mutex);
-
+out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
- return err;
+
+ if (recursive_logging)
+ ctx->logged_before = orig_logged_before;
+
+ return ret;
}
/*
@@ -5689,18 +6107,12 @@ struct btrfs_dir_list {
* link_to_fixup_dir());
*
* 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
- * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
- * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ * while logging the inode's items new index items (key type
+ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
* has a size that doesn't match the sum of the lengths of all the logged
- * names. This does not result in a problem because if a dir_item key is
- * logged but its matching dir_index key is not logged, at log replay time we
- * don't use it to replay the respective name (see replay_one_name()). On the
- * other hand if only the dir_index key ends up being logged, the respective
- * name is added to the fs/subvol tree with both the dir_item and dir_index
- * keys created (see replay_one_name()).
- * The directory's inode item with a wrong i_size is not a problem as well,
- * since we don't use it at log replay time to set the i_size in the inode
- * item of the fs/subvol tree (see overwrite_item()).
+ * names - this is ok, not a problem, because at log replay time we set the
+ * directory's i_size to the correct value (see replay_one_name() and
+ * do_overwrite_item()).
*/
static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -5708,12 +6120,19 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_root *log = root->log_root;
struct btrfs_path *path;
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
int ret = 0;
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -5738,11 +6157,11 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
goto next_dir_inode;
min_key.objectid = dir_elem->ino;
- min_key.type = BTRFS_DIR_ITEM_KEY;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = 0;
again:
btrfs_release_path(path);
- ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
if (ret < 0) {
goto next_dir_inode;
} else if (ret > 0) {
@@ -5750,7 +6169,6 @@ again:
goto next_dir_inode;
}
-process_leaf:
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
for (i = path->slots[0]; i < nritems; i++) {
@@ -5763,13 +6181,12 @@ process_leaf:
btrfs_item_key_to_cpu(leaf, &min_key, i);
if (min_key.objectid != dir_elem->ino ||
- min_key.type != BTRFS_DIR_ITEM_KEY)
+ min_key.type != BTRFS_DIR_INDEX_KEY)
goto next_dir_inode;
di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
type = btrfs_dir_type(leaf, di);
- if (btrfs_dir_transid(leaf, di) < trans->transid &&
- type != BTRFS_FT_DIR)
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
continue;
btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
if (di_key.type == BTRFS_ROOT_ITEM_KEY)
@@ -5788,9 +6205,9 @@ process_leaf:
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
log_mode, ctx);
btrfs_add_delayed_iput(di_inode);
if (ret)
@@ -5807,16 +6224,6 @@ process_leaf:
}
break;
}
- if (i == nritems) {
- ret = btrfs_next_leaf(log, path);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
- goto process_leaf;
- }
if (min_key.offset < (u64)-1) {
min_key.offset++;
goto again;
@@ -5875,7 +6282,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
break;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
struct btrfs_key inode_key;
@@ -5934,11 +6341,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
continue;
}
- if (ctx)
- ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
- if (!ret && ctx && ctx->log_new_dentries)
+ if (!ret && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
btrfs_add_delayed_iput(dir_inode);
@@ -5984,7 +6390,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >= trans->transid &&
need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
@@ -6039,7 +6445,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (inode->generation >= trans->transid &&
need_log_inode(trans, inode)) {
- ret = btrfs_log_inode(trans, root, inode,
+ ret = btrfs_log_inode(trans, inode,
LOG_INODE_EXISTS, ctx);
if (ret)
break;
@@ -6182,7 +6588,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
+ ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6199,7 +6605,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
+ if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
log_dentries = true;
/*
@@ -6325,8 +6731,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to pin buffers while recovering log root tree.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6339,8 +6744,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't find tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
if (ret > 0) {
@@ -6357,8 +6761,7 @@ again:
log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6386,8 +6789,7 @@ again:
if (!ret)
goto next;
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read target root for tree log recovery.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6395,14 +6797,15 @@ again:
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
if (ret)
/* The loop needs to continue due to the root refs */
- btrfs_handle_fs_error(fs_info, ret,
- "failed to record the log root in transaction");
+ btrfs_abort_transaction(trans, ret);
else
ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
@@ -6419,6 +6822,8 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
wc.replay_dest->log_root = NULL;
@@ -6549,15 +6954,32 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
mutex_unlock(&dir->log_mutex);
}
-/*
- * Call this after adding a new name for a file and it will properly
- * update the log to reflect the new name.
+/**
+ * Update the log after adding a new name for an inode.
+ *
+ * @trans: Transaction handle.
+ * @old_dentry: The dentry associated with the old name and the old
+ * parent directory.
+ * @old_dir: The inode of the previous parent directory for the case
+ * of a rename. For a link operation, it must be NULL.
+ * @old_dir_index: The index number associated with the old name, meaningful
+ * only for rename operations (when @old_dir is not NULL).
+ * Ignored for link operations.
+ * @parent: The dentry associated with the directory under which the
+ * new name is located.
+ *
+ * Call this after adding a new name for an inode, as a result of a link or
+ * rename operation, and it will properly update the log to reflect the new name.
*/
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent)
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent)
{
+ struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
+ struct btrfs_root *root = inode->root;
struct btrfs_log_ctx ctx;
+ bool log_pinned = false;
+ int ret;
/*
* this will force the logging code to walk the dentry chain
@@ -6570,27 +6992,83 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (!inode_logged(trans, inode) &&
- (!old_dir || !inode_logged(trans, old_dir)))
- return;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ if (!old_dir)
+ return;
+ /*
+ * If the inode was not logged and we are doing a rename (old_dir is not
+ * NULL), check if old_dir was logged - if it was not we can return and
+ * do nothing.
+ */
+ ret = inode_logged(trans, old_dir, NULL);
+ if (ret < 0)
+ goto out;
+ else if (ret == 0)
+ return;
+ }
+ ret = 0;
/*
* If we are doing a rename (old_dir is not NULL) from a directory that
- * was previously logged, make sure the next log attempt on the directory
- * is not skipped and logs the inode again. This is because the log may
- * not currently be authoritative for a range including the old
- * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
- * sure after a log replay we do not end up with both the new and old
- * dentries around (in case the inode is a directory we would have a
- * directory with two hard links and 2 inode references for different
- * parents). The next log attempt of old_dir will happen at
- * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
- * below, because we have previously set inode->last_unlink_trans to the
- * current transaction ID, either here or at btrfs_record_unlink_dir() in
- * case inode is a directory.
+ * was previously logged, make sure that on log replay we get the old
+ * dir entry deleted. This is needed because we will also log the new
+ * name of the renamed inode, so we need to make sure that after log
+ * replay we don't end up with both the new and old dir entries existing.
*/
- if (old_dir)
- old_dir->logged_trans = 0;
+ if (old_dir && old_dir->logged_trans == trans->transid) {
+ struct btrfs_root *log = old_dir->root->log_root;
+ struct btrfs_path *path;
+
+ ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
+
+ /*
+ * We have two inodes to update in the log, the old directory and
+ * the inode that got renamed, so we must pin the log to prevent
+ * anyone from syncing the log until we have updated both inodes
+ * in the log.
+ */
+ log_pinned = true;
+ btrfs_pin_log_trans(root);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Another concurrent task might be logging the old directory,
+ * as it can be triggered when logging other inode that had or
+ * still has a dentry in the old directory. We lock the old
+ * directory's log_mutex to ensure the deletion of the old
+ * name is persisted, because during directory logging we
+ * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
+ * the old name's dir index item is in the delayed items, so
+ * it could be missed by an in progress directory logging.
+ */
+ mutex_lock(&old_dir->log_mutex);
+ ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
+ old_dentry->d_name.name,
+ old_dentry->d_name.len, old_dir_index);
+ if (ret > 0) {
+ /*
+ * The dentry does not exist in the log, so record its
+ * deletion.
+ */
+ btrfs_release_path(path);
+ ret = insert_dir_log_key(trans, log, path,
+ btrfs_ino(old_dir),
+ old_dir_index, old_dir_index);
+ }
+ mutex_unlock(&old_dir->log_mutex);
+
+ btrfs_free_path(path);
+ if (ret < 0)
+ goto out;
+ }
btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
ctx.logging_new_name = true;
@@ -6602,5 +7080,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+out:
+ /*
+ * If an error happened mark the log for a full commit because it's not
+ * consistent and up to date or we couldn't find out if one of the
+ * inodes was logged before in this transaction. Do it before unpinning
+ * the log, to avoid any races with someone else trying to commit it.
+ */
+ if (ret < 0)
+ btrfs_set_log_full_commit(trans);
+ if (log_pinned)
+ btrfs_end_log_trans(root);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 731bd9c029f5..1620f8170629 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -17,6 +17,10 @@ struct btrfs_log_ctx {
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ /* Indicate if the inode being logged was logged before. */
+ bool logged_before;
+ /* Tracks the last logged dir item/index key offset. */
+ u64 last_dir_item_offset;
struct inode *inode;
struct list_head list;
/* Only used for fast fsyncs. */
@@ -30,6 +34,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_transid = 0;
ctx->log_new_dentries = false;
ctx->logging_new_name = false;
+ ctx->logged_before = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
INIT_LIST_HEAD(&ctx->ordered_extents);
@@ -68,14 +73,14 @@ int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
struct btrfs_log_ctx *ctx);
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index);
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid);
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index);
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
@@ -84,7 +89,7 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent);
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent);
#endif
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 74023c8a783f..b458452a1aaf 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
eb = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
offset = btrfs_item_ptr_offset(eb, slot);
ret = -ENOENT;
@@ -125,7 +125,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
- offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
+ offset += btrfs_item_size(eb, slot) - sizeof(subid_le);
} else {
btrfs_warn(fs_info,
"insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
@@ -186,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
if (!IS_ALIGNED(item_size, sizeof(u64))) {
btrfs_warn(fs_info, "uuid item with illegal size %lu!",
(unsigned long)item_size);
@@ -208,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
goto out;
}
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
if (item_size == sizeof(subid)) {
ret = btrfs_del_item(trans, uuid_root, path);
goto out;
@@ -331,7 +331,7 @@ again_search_slot:
goto skip;
offset = btrfs_item_ptr_offset(leaf, slot);
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
if (!IS_ALIGNED(item_size, sizeof(u64))) {
btrfs_warn(fs_info,
"uuid item with illegal size %lu!",
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 4968535dfff0..90eb5c2830a9 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -333,7 +333,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
if (key.objectid != btrfs_ino(inode) || key.type != key_type)
break;
- item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;
+ item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset;
if (copied > 0) {
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2ec3b8ac8fa3..a8cc736731fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
+#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
@@ -33,6 +34,10 @@
#include "discard.h"
#include "zoned.h"
+#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID10 | \
+ BTRFS_BLOCK_GROUP_RAID56_MASK)
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -250,7 +255,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map);
/*
@@ -508,7 +513,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
if (flush)
- filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ sync_blockdev(*bdev);
ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
blkdev_put(*bdev, flags);
@@ -529,30 +534,20 @@ error:
return ret;
}
-static bool device_path_matched(const char *path, struct btrfs_device *device)
-{
- int found;
-
- rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
- rcu_read_unlock();
-
- return found == 0;
-}
-
-/*
- * Search and remove all stale (devices which are not mounted) devices.
+/**
+ * Search and remove all stale devices (which are not mounted).
* When both inputs are NULL, it will search and release all stale devices.
- * path: Optional. When provided will it release all unmounted devices
- * matching this path only.
- * skip_dev: Optional. Will skip this device when searching for the stale
+ *
+ * @devt: Optional. When provided, it will release all unmounted devices
+ * matching this devt only.
+ * @skip_device: Optional. Will skip this device when searching for the stale
* devices.
- * Return: 0 for success or if @path is NULL.
- * -EBUSY if @path is a mounted device.
- * -ENOENT if @path does not match any device in the list.
+ *
+ * Return: 0 for success or if @devt is 0.
+ * -EBUSY if @devt is a mounted device.
+ * -ENOENT if @devt does not match any device in the list.
*/
-static int btrfs_free_stale_devices(const char *path,
- struct btrfs_device *skip_device)
+static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
@@ -560,7 +555,7 @@ static int btrfs_free_stale_devices(const char *path,
lockdep_assert_held(&uuid_mutex);
- if (path)
+ if (devt)
ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
@@ -570,13 +565,11 @@ static int btrfs_free_stale_devices(const char *path,
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
- if (path && !device->name)
- continue;
- if (path && !device_path_matched(path, device))
+ if (devt && devt != device->devt)
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
- if (path && ret != 0)
+ if (devt && ret != 0)
ret = -EBUSY;
break;
}
@@ -609,7 +602,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
{
- struct request_queue *q;
struct block_device *bdev;
struct btrfs_super_block *disk_super;
u64 devid;
@@ -651,8 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- q = bdev_get_queue(bdev);
- if (!blk_queue_nonrot(q))
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
fs_devices->rotating = true;
device->bdev = bdev;
@@ -776,11 +767,17 @@ static noinline struct btrfs_device *device_list_add(const char *path,
struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_t path_devt;
+ int error;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+ error = lookup_bdev(path, &path_devt);
+ if (error)
+ return ERR_PTR(error);
+
if (fsid_change_in_progress) {
if (!has_metadata_uuid)
fs_devices = find_fsid_inprogress(disk_super);
@@ -812,9 +809,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
device = NULL;
} else {
+ struct btrfs_dev_lookup_args args = {
+ .devid = devid,
+ .uuid = disk_super->dev_item.uuid,
+ };
+
mutex_lock(&fs_devices->device_list_mutex);
- device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL);
+ device = btrfs_find_device(fs_devices, &args);
/*
* If this disk has been pulled into an fs devices created by
@@ -859,6 +860,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
+ device->devt = path_devt;
list_add_rcu(&device->dev_list, &fs_devices->devices);
fs_devices->num_devices++;
@@ -919,25 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
- int error;
- dev_t path_dev;
-
- error = lookup_bdev(path, &path_dev);
- if (error) {
- mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_PTR(error);
- }
-
- if (device->bdev->bd_dev != path_dev) {
+ if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
@@ -945,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
@@ -963,6 +955,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
+ device->devt = path_devt;
}
/*
@@ -1091,7 +1084,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
__btrfs_free_extra_devids(seed_dev, &latest_dev);
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
mutex_unlock(&uuid_mutex);
}
@@ -1122,8 +1115,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
+ }
btrfs_close_bdev(device);
if (device->bdev) {
@@ -1155,7 +1150,6 @@ static void btrfs_close_one_device(struct btrfs_device *device)
ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
ASSERT(list_empty(&device->dev_alloc_list));
ASSERT(list_empty(&device->post_commit_list));
- ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -1222,7 +1216,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
fs_devices->opened = 1;
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
@@ -1286,7 +1280,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
pgoff_t index;
/* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
/* make sure our super fits in the page */
@@ -1321,12 +1315,12 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
return disk_super;
}
-int btrfs_forget_devices(const char *path)
+int btrfs_forget_devices(dev_t devt)
{
int ret;
mutex_lock(&uuid_mutex);
- ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+ ret = btrfs_free_stale_devices(devt, NULL);
mutex_unlock(&uuid_mutex);
return ret;
@@ -1363,8 +1357,10 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
bytenr_orig = btrfs_sb_offset(0);
ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
- if (ret)
- return ERR_PTR(ret);
+ if (ret) {
+ device = ERR_PTR(ret);
+ goto error_bdev_put;
+ }
disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
if (IS_ERR(disk_super)) {
@@ -1373,10 +1369,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
}
device = device_list_add(path, disk_super, &new_device_added);
- if (!IS_ERR(device)) {
- if (new_device_added)
- btrfs_free_stale_devices(path, device);
- }
+ if (!IS_ERR(device) && new_device_added)
+ btrfs_free_stale_devices(device->devt, device);
btrfs_release_disk_super(disk_super);
@@ -1843,8 +1837,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, true);
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
+ btrfs_trans_release_chunk_metadata(trans);
if (ret)
goto out;
@@ -1882,60 +1878,52 @@ out:
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
*/
-static void update_dev_time(struct block_device *bdev)
+static void update_dev_time(const char *device_path)
{
- struct inode *inode = bdev->bd_inode;
+ struct path path;
struct timespec64 now;
+ int ret;
- /* Shouldn't happen but just in case. */
- if (!inode)
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+ if (ret)
return;
- now = current_time(inode);
- generic_update_time(inode, &now, S_MTIME | S_CTIME);
+ now = current_time(d_inode(path.dentry));
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+ path_put(&path);
}
-static int btrfs_rm_dev_item(struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_trans_handle *trans;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- return PTR_ERR(trans);
- }
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret) {
if (ret > 0)
ret = -ENOENT;
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- }
-
out:
btrfs_free_path(path);
- if (!ret)
- ret = btrfs_commit_transaction(trans);
return ret;
}
@@ -1986,7 +1974,7 @@ static struct btrfs_device * btrfs_find_next_active_device(
}
/*
- * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * Helper function to check if the given device is part of s_bdev / latest_dev
* and replace it with the provided or the next active device. In the context
* where this function is called, there should always be another device (or
* this_dev) which is active.
@@ -2005,8 +1993,8 @@ void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
(fs_info->sb->s_bdev == device->bdev))
fs_info->sb->s_bdev = next_device->bdev;
- if (fs_info->fs_devices->latest_bdev == device->bdev)
- fs_info->fs_devices->latest_bdev = next_device->bdev;
+ if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+ fs_info->fs_devices->latest_dev = next_device;
}
/*
@@ -2069,55 +2057,58 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(bdev);
+ update_dev_time(device_path);
}
-int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid, struct block_device **bdev, fmode_t *mode)
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ struct block_device **bdev, fmode_t *mode)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 num_devices;
int ret = 0;
- mutex_lock(&uuid_mutex);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
- goto out;
-
- device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
+ return ret;
- if (IS_ERR(device)) {
- if (PTR_ERR(device) == -ENOENT &&
- device_path && strcmp(device_path, "missing") == 0)
+ device = btrfs_find_device(fs_info->fs_devices, args);
+ if (!device) {
+ if (args->missing)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
- ret = PTR_ERR(device);
- goto out;
+ ret = -ENOENT;
+ return ret;
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
rcu_str_deref(device->name), device->devid);
- ret = -ETXTBSY;
- goto out;
+ return -ETXTBSY;
}
- if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
- ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto out;
- }
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ return BTRFS_ERROR_DEV_TGT_REPLACE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
- fs_info->fs_devices->rw_devices == 1) {
- ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto out;
- }
+ fs_info->fs_devices->rw_devices == 1)
+ return BTRFS_ERROR_DEV_ONLY_WRITABLE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
@@ -2126,22 +2117,26 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
- if (!ret)
- btrfs_reada_remove_dev(device);
- mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
- /*
- * TODO: the superblock still includes this device in its num_devices
- * counter although write_all_supers() is not locked out. This
- * could give a filesystem state which requires a degraded mount.
- */
- ret = btrfs_rm_dev_item(device);
- if (ret)
+ trans = btrfs_start_transaction(fs_info->chunk_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto error_undo;
+ }
+
+ ret = btrfs_rm_dev_item(trans, device);
+ if (ret) {
+ /* Any error in dev item removal is critical */
+ btrfs_crit(fs_info,
+ "failed to remove device item for devid %llu: %d",
+ device->devid, ret);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
@@ -2159,7 +2154,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
/*
* In normal cases the cur_devices == fs_devices. But in case
* of deleting a seed device, the cur_devices should point to
- * its own fs_devices listed under the fs_devices->seed.
+ * its own fs_devices listed under the fs_devices->seed_list.
*/
cur_devices = device->fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
@@ -2210,18 +2205,25 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
synchronize_rcu();
btrfs_free_device(device);
- if (cur_devices->open_devices == 0) {
+ /*
+ * This can happen if cur_devices is the private seed devices list. We
+ * cannot call close_fs_devices() here because it expects the uuid_mutex
+ * to be held, but in fact we don't need that for the private
+ * seed_devices, we can simply decrement cur_devices->opened and then
+ * remove it from our list and free the fs_devices.
+ */
+ if (cur_devices->num_devices == 0) {
list_del_init(&cur_devices->seed_list);
- close_fs_devices(cur_devices);
+ ASSERT(cur_devices->opened == 1);
+ cur_devices->opened--;
free_fs_devices(cur_devices);
}
-out:
- mutex_unlock(&uuid_mutex);
+ ret = btrfs_commit_transaction(trans);
+
return ret;
error_undo:
- btrfs_reada_undo_remove_dev(device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
@@ -2229,7 +2231,7 @@ error_undo:
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
- goto out;
+ return ret;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
@@ -2305,13 +2307,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * The update_dev_time() with in btrfs_scratch_superblocks()
- * may lead to a call to btrfs_show_devname() which will try
- * to hold device_list_mutex. And here this device
- * is already out of device list, so we don't have to hold
- * the device_list_mutex lock.
- */
btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
tgtdev->name->str);
@@ -2320,86 +2315,109 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
btrfs_free_device(tgtdev);
}
-static struct btrfs_device *btrfs_find_device_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
+/**
+ * Populate args from device at path
+ *
+ * @fs_info: the filesystem
+ * @args: the args to populate
+ * @path: the path to the device
+ *
+ * This will read the super block of the device at @path and populate @args with
+ * the devid, fsid, and uuid. This is meant to be used for ioctls that need to
+ * look up a device to operate on, but need to do it before we take any locks.
+ * This properly handles the special case of "missing" that a user may pass in
+ * (only @args->missing is set, nothing is allocated), and does some basic
+ * sanity checks. The caller must make sure that @path is properly NUL
+ * terminated before calling in, and must call btrfs_put_dev_args_from_path()
+ * after a successful return in order to free up the temporary fsid and uuid
+ * buffers. On failure the buffers have already been released here, so error
+ * paths in callers do not need to clean up.
+ *
+ * Return: 0 for success, -errno for failure
+ */
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path)
{
- int ret = 0;
struct btrfs_super_block *disk_super;
- u64 devid;
- u8 *dev_uuid;
struct block_device *bdev;
- struct btrfs_device *device;
+ int ret;
- ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
- fs_info->bdev_holder, 0, &bdev, &disk_super);
- if (ret)
- return ERR_PTR(ret);
+ if (!path || !path[0])
+ return -EINVAL;
+ if (!strcmp(path, "missing")) {
+ args->missing = true;
+ return 0;
+ }
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- dev_uuid = disk_super->dev_item.uuid;
+ args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
+ args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
+ if (!args->uuid || !args->fsid) {
+ btrfs_put_dev_args_from_path(args);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
+ &bdev, &disk_super);
+ if (ret) {
+ /*
+ * Callers bail out on error without calling the put helper, so
+ * release the uuid/fsid buffers allocated above to avoid a leak.
+ */
+ btrfs_put_dev_args_from_path(args);
+ return ret;
+ }
+ args->devid = btrfs_stack_device_id(&disk_super->dev_item);
+ memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid);
+ memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
else
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid);
-
+ memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- if (!device)
- device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
- return device;
+ return 0;
}
/*
- * Lookup a device given by device id, or the path if the id is 0.
+ * Only use this jointly with btrfs_get_dev_args_from_path() because we will
+ * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
+ * that don't need to be freed.
*/
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
+{
+ kfree(args->uuid);
+ kfree(args->fsid);
+ args->uuid = NULL;
+ args->fsid = NULL;
+}
+
struct btrfs_device *btrfs_find_device_by_devspec(
struct btrfs_fs_info *fs_info, u64 devid,
const char *device_path)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *device;
+ int ret;
if (devid) {
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device)
return ERR_PTR(-ENOENT);
return device;
}
- if (!device_path || !device_path[0])
- return ERR_PTR(-EINVAL);
-
- if (strcmp(device_path, "missing") == 0) {
- /* Find first missing device */
- list_for_each_entry(device, &fs_info->fs_devices->devices,
- dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state) && !device->bdev)
- return device;
- }
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
+ if (ret)
+ return ERR_PTR(ret);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ btrfs_put_dev_args_from_path(&args);
+ if (!device)
return ERR_PTR(-ENOENT);
- }
-
- return btrfs_find_device_by_path(fs_info, device_path);
+ return device;
}
-/*
- * does all the dirty work required for changing file system's UUID.
- */
-static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
+static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
- struct btrfs_super_block *disk_super = fs_info->super_copy;
- struct btrfs_device *device;
- u64 super_flags;
lockdep_assert_held(&uuid_mutex);
if (!fs_devices->seeding)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/*
* Private copy of the seed devices, anchored at
@@ -2407,7 +2425,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
*/
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
- return PTR_ERR(seed_devices);
+ return seed_devices;
/*
* It's necessary to retain a copy of the original seed fs_devices in
@@ -2418,7 +2436,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
- return PTR_ERR(old_devices);
+ return old_devices;
}
list_add(&old_devices->fs_list, &fs_uuids);
@@ -2429,7 +2447,41 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
- mutex_lock(&fs_devices->device_list_mutex);
+ return seed_devices;
+}
+
+/*
+ * Splice seed devices into the sprout fs_devices.
+ * Generate a new fsid for the sprouted read-write filesystem.
+ */
+static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *seed_devices)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct btrfs_device *device;
+ u64 super_flags;
+
+ /*
+ * We are updating the fsid, the thread leading to device_list_add()
+ * could race, so uuid_mutex is needed.
+ */
+ lockdep_assert_held(&uuid_mutex);
+
+ /*
+ * The threads listed below may traverse dev_list but can do that without
+ * device_list_mutex:
+ * - All device ops and balance - as we are in btrfs_exclop_start.
+ * - Various dev_list readers - are using RCU.
+ * - btrfs_ioctl_fitrim() - is using RCU.
+ *
+ * For-read threads as below are using device_list_mutex:
+ * - Readonly scrub btrfs_scrub_dev()
+ * - Readonly scrub btrfs_scrub_progress()
+ * - btrfs_get_dev_stats()
+ */
+ lockdep_assert_held(&fs_devices->device_list_mutex);
+
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
list_for_each_entry(device, &seed_devices->devices, dev_list)
@@ -2445,13 +2497,10 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
- mutex_unlock(&fs_devices->device_list_mutex);
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
btrfs_set_super_flags(disk_super, super_flags);
-
- return 0;
}
/*
@@ -2459,6 +2508,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
@@ -2468,7 +2518,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
struct btrfs_key key;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- u64 devid;
int ret;
path = btrfs_alloc_path();
@@ -2480,7 +2529,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
goto error;
@@ -2505,13 +2556,14 @@ next_slot:
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
- devid = btrfs_device_id(leaf, dev_item);
+ args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2532,17 +2584,17 @@ error:
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
struct btrfs_root *root = fs_info->dev_root;
- struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
struct super_block *sb = fs_info->sb;
struct rcu_string *name;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *seed_devices;
u64 orig_super_total_bytes;
u64 orig_super_num_devices;
- int seeding_dev = 0;
int ret = 0;
+ bool seeding_dev = false;
bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
@@ -2559,7 +2611,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
if (fs_devices->seeding) {
- seeding_dev = 1;
+ seeding_dev = true;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
locked = true;
@@ -2593,8 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->fs_info = fs_info;
device->bdev = bdev;
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error_free_device;
- ret = btrfs_get_dev_zone_info(device);
+ ret = btrfs_get_dev_zone_info(device, false);
if (ret)
goto error_free_device;
@@ -2604,14 +2659,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_free_zone;
}
- q = bdev_get_queue(bdev);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
- device->total_bytes = round_down(i_size_read(bdev->bd_inode),
- fs_info->sectorsize);
+ device->total_bytes =
+ round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -2622,16 +2676,25 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
btrfs_clear_sb_rdonly(sb);
- ret = btrfs_prepare_sprout(fs_info);
- if (ret) {
+
+ /* GFP_KERNEL allocation must not be under device_list_mutex */
+ seed_devices = btrfs_init_sprout(fs_info);
+ if (IS_ERR(seed_devices)) {
+ ret = PTR_ERR(seed_devices);
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
}
+ mutex_lock(&fs_devices->device_list_mutex);
+ if (seeding_dev) {
+ btrfs_setup_sprout(fs_info, seed_devices);
+ btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+ device);
+ }
+
device->fs_devices = fs_devices;
- mutex_lock(&fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
@@ -2643,7 +2706,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!blk_queue_nonrot(q))
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2693,7 +2756,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
/*
* fs_devices now represents the newly sprouted filesystem and
- * its fsid has been changed by btrfs_prepare_sprout
+ * its fsid has been changed by btrfs_setup_sprout().
*/
btrfs_sysfs_update_sprout_fsid(fs_devices);
}
@@ -2730,10 +2793,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
* We can ignore the return value as it typically returns -EINVAL and
* only succeeds if the device was an alien.
*/
- btrfs_forget_devices(device_path);
+ btrfs_forget_devices(device->devt);
/* Update ctime/mtime for blkid or udev */
- update_dev_time(bdev);
+ update_dev_time(device_path);
return ret;
@@ -2826,6 +2889,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
+ int ret;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
@@ -2854,7 +2918,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
- return btrfs_update_device(trans, device);
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@ -3096,7 +3164,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3162,6 +3230,12 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
u64 length;
int ret;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "relocate: not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
/*
* Prevent races with automatic removal of unused block groups.
* After we relocate and before we remove the chunk with offset
@@ -4301,8 +4375,10 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
- if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+ if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
btrfs_info(fs_info, "balance: paused");
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ }
/*
* Balance can be canceled by:
*
@@ -4354,10 +4430,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
return ret;
}
@@ -4378,6 +4456,10 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
return 0;
}
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
/*
* A ro->rw remount sequence should continue with the paused balance
* regardless of who pauses it, system or the user as of now, so set
@@ -4446,7 +4528,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
@@ -4587,7 +4669,7 @@ int btrfs_uuid_scan_kthread(void *data)
eb = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
if (item_size < sizeof(root_item))
goto skip;
@@ -4889,8 +4971,10 @@ again:
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_reserve_chunk_metadata(trans, false);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4973,7 +5057,7 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
}
/*
- * Structure used internally for __btrfs_alloc_chunk() function.
+ * Structure used internally for btrfs_create_chunk() function.
* Wraps needed parameters.
*/
struct alloc_chunk_ctl {
@@ -5377,7 +5461,7 @@ error_del_extent:
return block_group;
}
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -5446,7 +5530,6 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_key key;
struct btrfs_chunk *chunk;
@@ -5518,7 +5601,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
}
btrfs_set_stack_chunk_length(chunk, bg->length);
- btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+ btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
btrfs_set_stack_chunk_type(chunk, map->type);
btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
@@ -5578,12 +5661,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ meta_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ sys_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
@@ -5597,17 +5680,17 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
return btrfs_raid_array[index].tolerated_failures;
}
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- int readonly = 0;
int miss_ndevs = 0;
int i;
+ bool ret = true;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
- return 1;
+ return false;
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
@@ -5618,21 +5701,20 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
&map->stripes[i].dev->dev_state)) {
- readonly = 1;
+ ret = false;
goto end;
}
}
/*
- * If the number of missing devices is larger than max errors,
- * we can not write the data into that chunk successfully, so
- * set it readonly.
+ * If the number of missing devices is larger than max errors, we can
+ * not write the data into that chunk successfully.
*/
if (miss_ndevs > btrfs_chunk_max_errors(map))
- readonly = 1;
+ ret = false;
end:
free_extent_map(em);
- return readonly;
+ return ret;
}
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
@@ -5795,7 +5877,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
{
int i;
int again = 1;
@@ -5804,52 +5886,55 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
/* Swap if parity is on a smaller index */
- if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
- swap(bbio->stripes[i], bbio->stripes[i + 1]);
- swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
+ if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
+ swap(bioc->stripes[i], bioc->stripes[i + 1]);
+ swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
again = 1;
}
}
}
}
-static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ int total_stripes,
+ int real_stripes)
{
- struct btrfs_bio *bbio = kzalloc(
- /* the size of the btrfs_bio */
- sizeof(struct btrfs_bio) +
- /* plus the variable array for the stripes */
- sizeof(struct btrfs_bio_stripe) * (total_stripes) +
- /* plus the variable array for the tgt dev */
+ struct btrfs_io_context *bioc = kzalloc(
+ /* The size of btrfs_io_context */
+ sizeof(struct btrfs_io_context) +
+ /* Plus the variable array for the stripes */
+ sizeof(struct btrfs_io_stripe) * (total_stripes) +
+ /* Plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
/*
- * plus the raid_map, which includes both the tgt dev
- * and the stripes
+ * Plus the raid_map, which includes both the tgt dev
+ * and the stripes.
*/
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
- atomic_set(&bbio->error, 0);
- refcount_set(&bbio->refs, 1);
+ atomic_set(&bioc->error, 0);
+ refcount_set(&bioc->refs, 1);
- bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
- bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+ bioc->fs_info = fs_info;
+ bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
+ bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
- return bbio;
+ return bioc;
}
-void btrfs_get_bbio(struct btrfs_bio *bbio)
+void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
- WARN_ON(!refcount_read(&bbio->refs));
- refcount_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bioc->refs));
+ refcount_inc(&bioc->refs);
}
-void btrfs_put_bbio(struct btrfs_bio *bbio)
+void btrfs_put_bioc(struct btrfs_io_context *bioc)
{
- if (!bbio)
+ if (!bioc)
return;
- if (refcount_dec_and_test(&bbio->refs))
- kfree(bbio);
+ if (refcount_dec_and_test(&bioc->refs))
+ kfree(bioc);
}
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
@@ -5859,11 +5944,11 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
@@ -5882,8 +5967,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
int ret = 0;
int i;
- /* discard always return a bbio */
- ASSERT(bbio_ret);
+ /* Discard always returns a bioc. */
+ ASSERT(bioc_ret);
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
@@ -5946,26 +6031,25 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
&stripe_index);
}
- bbio = alloc_btrfs_bio(num_stripes, 0);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
+ bioc->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
+ bioc->stripes[i].length = stripes_per_dev *
map->stripe_len;
if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
+ bioc->stripes[i].length += map->stripe_len;
/*
* Special for the first stripe and
@@ -5976,19 +6060,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* off end_off
*/
if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
+ bioc->stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
+ bioc->stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bbio->stripes[i].length = length;
+ bioc->stripes[i].length = length;
}
stripe_index++;
@@ -5998,9 +6080,9 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
out:
free_extent_map(em);
return ret;
@@ -6024,7 +6106,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
u64 srcdev_devid, int *mirror_num,
u64 *physical)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int num_stripes;
int index_srcdev = 0;
int found = 0;
@@ -6033,20 +6115,20 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bbio, 0, 0);
+ logical, &length, &bioc, 0, 0);
if (ret) {
- ASSERT(bbio == NULL);
+ ASSERT(bioc == NULL);
return ret;
}
- num_stripes = bbio->num_stripes;
+ num_stripes = bioc->num_stripes;
if (*mirror_num > num_stripes) {
/*
* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
* that means that the requested area is not left of the left
* cursor
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return -EIO;
}
@@ -6056,7 +6138,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* pointer to the one of the target drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid != srcdev_devid)
+ if (bioc->stripes[i].dev->devid != srcdev_devid)
continue;
/*
@@ -6064,15 +6146,15 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* mirror with the lowest physical address
*/
if (found &&
- physical_of_found <= bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
ASSERT(found);
if (!found)
@@ -6103,12 +6185,12 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
}
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
struct btrfs_dev_replace *dev_replace,
u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
- struct btrfs_bio *bbio = *bbio_ret;
+ struct btrfs_io_context *bioc = *bioc_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
int tgtdev_indexes = 0;
int num_stripes = *num_stripes_ret;
@@ -6138,17 +6220,17 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*/
index_where_to_add = num_stripes;
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
+ struct btrfs_io_stripe *new =
+ bioc->stripes + index_where_to_add;
+ struct btrfs_io_stripe *old =
+ bioc->stripes + i;
new->physical = old->physical;
new->length = old->length;
new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
+ bioc->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
tgtdev_indexes++;
@@ -6168,30 +6250,29 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
* full copy of the source drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it simple,
* only add the mirror with the lowest physical
* address
*/
if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
}
if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
+ struct btrfs_io_stripe *tgtdev_stripe =
+ bioc->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
+ bioc->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
+ bioc->tgtdev_map[index_srcdev] = num_stripes;
tgtdev_indexes++;
num_stripes++;
@@ -6200,8 +6281,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*num_stripes_ret = num_stripes;
*max_errors_ret = max_errors;
- bbio->num_tgtdevs = tgtdev_indexes;
- *bbio_ret = bbio;
+ bioc->num_tgtdevs = tgtdev_indexes;
+ *bioc_ret = bioc;
}
static bool need_full_stripe(enum btrfs_map_op op)
@@ -6258,7 +6339,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
stripe_offset = offset - stripe_offset;
data_stripes = nr_data_stripes(map);
- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ /* Only stripe based profiles need to check against stripe length. */
+ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
u64 max_len = stripe_len - stripe_offset;
/*
@@ -6304,7 +6386,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map)
{
struct extent_map *em;
@@ -6319,7 +6401,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
int num_alloc_stripes;
@@ -6328,7 +6410,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 raid56_full_stripe_start = (u64)-1;
struct btrfs_io_geometry geom;
- ASSERT(bbio_ret);
+ ASSERT(bioc_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
em = btrfs_get_chunk_map(fs_info, logical, *length);
@@ -6472,20 +6554,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
tgtdev_indexes = num_stripes;
}
- bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+ bioc->stripes[i].physical = map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
stripe_index++;
}
- /* build raid_map */
+ /* Build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
(need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
@@ -6497,15 +6579,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* Fill in the logical address of each stripe */
tmp = stripe_nr * data_stripes;
for (i = 0; i < data_stripes; i++)
- bbio->raid_map[(i+rot) % num_stripes] =
+ bioc->raid_map[(i + rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
- bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- bbio->raid_map[(i+rot+1) % num_stripes] =
+ bioc->raid_map[(i + rot + 1) % num_stripes] =
RAID6_Q_STRIPE;
- sort_parity_stripes(bbio, num_stripes);
+ sort_parity_stripes(bioc, num_stripes);
}
if (need_full_stripe(op))
@@ -6513,15 +6595,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+ handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
&num_stripes, &max_errors);
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
- bbio->max_errors = max_errors;
- bbio->mirror_num = mirror_num;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
+ bioc->max_errors = max_errors;
+ bioc->mirror_num = mirror_num;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -6530,9 +6612,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*/
if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
WARN_ON(num_stripes > 1);
- bbio->stripes[0].dev = dev_replace->tgtdev;
- bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
- bbio->mirror_num = map->num_stripes + 1;
+ bioc->stripes[0].dev = dev_replace->tgtdev;
+ bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bioc->mirror_num = map->num_stripes + 1;
}
out:
if (dev_replace_is_ongoing) {
@@ -6546,43 +6628,43 @@ out:
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num)
+ struct btrfs_io_context **bioc_ret, int mirror_num)
{
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
- length, bbio_ret);
+ length, bioc_ret);
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
}
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
{
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
+ bio->bi_private = bioc->private;
+ bio->bi_end_io = bioc->end_io;
bio_endio(bio);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
}
static void btrfs_end_bio(struct bio *bio)
{
- struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_io_context *bioc = bio->bi_private;
int is_orig_bio = 0;
if (bio->bi_status) {
- atomic_inc(&bbio->error);
+ atomic_inc(&bioc->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_io_bio(bio)->device;
+ struct btrfs_device *dev = btrfs_bio(bio)->device;
ASSERT(dev->bdev);
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
@@ -6597,22 +6679,22 @@ static void btrfs_end_bio(struct bio *bio)
}
}
- if (bio == bbio->orig_bio)
+ if (bio == bioc->orig_bio)
is_orig_bio = 1;
- btrfs_bio_counter_dec(bbio->fs_info);
+ btrfs_bio_counter_dec(bioc->fs_info);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
- bio = bbio->orig_bio;
+ bio = bioc->orig_bio;
}
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
- if (atomic_read(&bbio->error) > bbio->max_errors) {
+ if (atomic_read(&bioc->error) > bioc->max_errors) {
bio->bi_status = BLK_STS_IOERR;
} else {
/*
@@ -6622,19 +6704,19 @@ static void btrfs_end_bio(struct bio *bio)
bio->bi_status = BLK_STS_OK;
}
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
-static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
+static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
u64 physical, struct btrfs_device *dev)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
- bio->bi_private = bbio;
- btrfs_io_bio(bio)->device = dev;
+ bio->bi_private = bioc;
+ btrfs_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
/*
@@ -6663,20 +6745,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfsic_submit_bio(bio);
}
-static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
{
- atomic_inc(&bbio->error);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ atomic_inc(&bioc->error);
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
/* Should be the original bio. */
- WARN_ON(bio != bbio->orig_bio);
+ WARN_ON(bio != bioc->orig_bio);
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bbio->error) > bbio->max_errors)
+ if (atomic_read(&bioc->error) > bioc->max_errors)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_status = BLK_STS_OK;
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
}
}
@@ -6691,36 +6773,34 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int ret;
int dev_nr;
int total_devs;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
length = bio->bi_iter.bi_size;
map_length = length;
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bbio, mirror_num, 1);
+ &map_length, &bioc, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
return errno_to_blk_status(ret);
}
- total_devs = bbio->num_stripes;
- bbio->orig_bio = first_bio;
- bbio->private = first_bio->bi_private;
- bbio->end_io = first_bio->bi_end_io;
- bbio->fs_info = fs_info;
- atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+ total_devs = bioc->num_stripes;
+ bioc->orig_bio = first_bio;
+ bioc->private = first_bio->bi_private;
+ bioc->end_io = first_bio->bi_end_io;
+ atomic_set(&bioc->stripes_pending, bioc->num_stripes);
- if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+ if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- ret = raid56_parity_write(fs_info, bio, bbio,
- map_length);
+ ret = raid56_parity_write(bio, bioc, map_length);
} else {
- ret = raid56_parity_recover(fs_info, bio, bbio,
- map_length, mirror_num, 1);
+ ret = raid56_parity_recover(bio, bioc, map_length,
+ mirror_num, 1);
}
btrfs_bio_counter_dec(fs_info);
@@ -6735,12 +6815,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bbio->stripes[dev_nr].dev;
+ dev = bioc->stripes[dev_nr].dev;
if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
&dev->dev_state) ||
(btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bbio_error(bbio, first_bio, logical);
+ bioc_error(bioc, first_bio, logical);
continue;
}
@@ -6749,12 +6829,39 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
else
bio = first_bio;
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
+ submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
}
+static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_fs_devices *fs_devices)
+{
+ if (args->fsid == NULL)
+ return true;
+ if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
+ return true;
+ return false;
+}
+
+static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_device *device)
+{
+ ASSERT((args->devid != (u64)-1) || args->missing);
+
+ if ((args->devid != (u64)-1) && device->devid != args->devid)
+ return false;
+ if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
+ return false;
+ if (!args->missing)
+ return true;
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+}
+
/*
* Find a device specified by @devid or @uuid in the list of @fs_devices, or
* return NULL.
@@ -6762,31 +6869,25 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
*/
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid)
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
- if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ if (dev_args_match_fs_devices(args, fs_devices)) {
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
+ if (dev_args_match_device(args, device))
return device;
}
}
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
- if (!fsid ||
- !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &seed_devs->devices,
- dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
- return device;
- }
+ if (!dev_args_match_fs_devices(args, seed_devs))
+ continue;
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ if (dev_args_match_device(args, device))
+ return device;
}
}
@@ -6862,11 +6963,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
- atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
extent_io_tree_init(fs_info, &dev->alloc_state,
IO_TREE_DEVICE_ALLOC_STATE, NULL);
@@ -6949,9 +7047,31 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
}
#endif
+static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
+ u64 devid, u8 *uuid)
+{
+ struct btrfs_device *dev;
+
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info, devid, uuid, true);
+ return ERR_PTR(-ENOENT);
+ }
+
+ dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
+ if (IS_ERR(dev)) {
+ btrfs_err(fs_info, "failed to init missing device %llu: %ld",
+ devid, PTR_ERR(dev));
+ return dev;
+ }
+ btrfs_report_missing_device(fs_info, devid, uuid, false);
+
+ return dev;
+}
+
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
@@ -7029,33 +7149,23 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ args.devid = devid;
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL);
- if (!map->stripes[i].dev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
- free_extent_map(em);
- btrfs_report_missing_device(fs_info, devid, uuid, true);
- return -ENOENT;
- }
+ args.uuid = uuid;
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!map->stripes[i].dev) {
- map->stripes[i].dev =
- add_missing_dev(fs_info->fs_devices, devid,
- uuid);
+ map->stripes[i].dev = handle_missing_device(fs_info,
+ devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
- btrfs_err(fs_info,
- "failed to init missing dev %llu: %ld",
- devid, PTR_ERR(map->stripes[i].dev));
return PTR_ERR(map->stripes[i].dev);
}
- btrfs_report_missing_device(fs_info, devid, uuid, false);
}
+
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&(map->stripes[i].dev->dev_state));
-
}
write_lock(&map_tree->lock);
@@ -7151,6 +7261,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
@@ -7159,11 +7270,13 @@ static int read_one_dev(struct extent_buffer *leaf,
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- devid = btrfs_device_id(leaf, dev_item);
+ devid = args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
@@ -7171,8 +7284,7 @@ static int read_one_dev(struct extent_buffer *leaf,
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -7236,7 +7348,7 @@ static int read_one_dev(struct extent_buffer *leaf,
fill_device_from_item(leaf, dev_item, device);
if (device->bdev) {
- u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+ u64 max_total_bytes = bdev_nr_bytes(device->bdev);
if (device->total_bytes > max_total_bytes) {
btrfs_err(fs_info,
@@ -7482,6 +7594,19 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
fs_info->fs_devices->total_rw_bytes = 0;
/*
+ * Lockdep complains about possible circular locking dependency between
+ * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
+ * used for freeze protection of a fs (struct super_block.s_writers),
+ * which we take when starting a transaction, and extent buffers of the
+ * chunk tree if we call read_one_dev() while holding a lock on an
+ * extent buffer of the chunk tree. Since we are mounting the filesystem
+ * and at this point there can't be any concurrent task modifying the
+ * chunk tree, to keep it simple, just skip locking on the chunk tree.
+ */
+ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
+ path->skip_locking = 1;
+
+ /*
* Read all device items, and then all the chunk items. All
* device items are found before any chunk item (their object id
* is smaller than the lowest possible object id for a chunk
@@ -7506,10 +7631,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
goto error;
break;
}
- /*
- * The nodes on level 1 are not locked but we don't need to do
- * that during mount time as nothing else can access the tree
- */
node = path->nodes[1];
if (node) {
if (last_ra_node != node->start) {
@@ -7537,7 +7658,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* requirement for chunk allocation, see the comment on
* top of btrfs_chunk_alloc() for details.
*/
- ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
ret = read_one_chunk(&found_key, leaf, chunk);
if (ret)
@@ -7643,7 +7763,7 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
}
slot = path->slots[0];
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
@@ -7721,7 +7841,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
}
if (ret == 0 &&
- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
@@ -7841,12 +7961,14 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *dev;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
+ args.devid = stats->devid;
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7922,6 +8044,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
@@ -7977,7 +8100,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device boundary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
@@ -8173,10 +8296,12 @@ static int relocating_repair_kthread(void *data)
target = cache->start;
btrfs_put_block_group(cache);
+ sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
btrfs_info(fs_info,
"zoned: skip relocating block group %llu to repair: EBUSY",
target);
+ sb_end_write(fs_info->sb);
return -EBUSY;
}
@@ -8204,27 +8329,31 @@ out:
btrfs_put_block_group(cache);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
return ret;
}
-int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
struct btrfs_block_group *cache;
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
/* Do not attempt to repair in degraded state */
if (btrfs_test_opt(fs_info, DEGRADED))
- return 0;
+ return true;
cache = btrfs_lookup_block_group(fs_info, logical);
if (!cache)
- return 0;
+ return true;
spin_lock(&cache->lock);
if (cache->relocating_repair) {
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
- return 0;
+ return true;
}
cache->relocating_repair = 1;
spin_unlock(&cache->lock);
@@ -8232,5 +8361,5 @@ int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
kthread_run(relocating_repair_kthread, cache,
"btrfs-relocating-repair");
- return 0;
+ return true;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2183361db614..f3e28f11cfb6 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -72,6 +72,11 @@ struct btrfs_device {
/* the mode sent to blkdev_get */
fmode_t mode;
+ /*
+ * Device's major-minor number. Must be set even if the device is not
+ * opened (bdev == NULL), unless the device is missing.
+ */
+ dev_t devt;
unsigned long dev_state;
blk_status_t last_flush_error;
@@ -123,13 +128,6 @@ struct btrfs_device {
/* per-device scrub information */
struct scrub_ctx *scrub_ctx;
- /* readahead state */
- atomic_t reada_in_flight;
- u64 reada_next;
- struct reada_zone *reada_curr_zone;
- struct radix_tree_root reada_zones;
- struct radix_tree_root reada_extents;
-
/* disk I/O failure stats. For detailed description refer to
* enum btrfs_dev_stat_values in ioctl.h */
int dev_stats_valid;
@@ -236,17 +234,40 @@ struct btrfs_fs_devices {
bool fsid_change;
struct list_head fs_list;
+ /*
+ * Number of devices under this fsid including missing and
+ * replace-target device and excludes seed devices.
+ */
u64 num_devices;
+
+ /*
+ * The number of devices that successfully opened, including
+ * replace-target, excludes seed devices.
+ */
u64 open_devices;
+
+ /* The number of devices that are under the chunk allocation list. */
u64 rw_devices;
+
+ /* Count of missing devices under this fsid excluding seed device. */
u64 missing_devices;
u64 total_rw_bytes;
+
+ /*
+ * Count of devices from btrfs_super_block::num_devices for this fsid,
+ * which includes the seed device, excludes the transient replace-target
+ * device.
+ */
u64 total_devices;
/* Highest generation number of seen devices */
u64 latest_generation;
- struct block_device *latest_bdev;
+ /*
+ * The mount device or a device with highest generation after removal
+ * or replace.
+ */
+ struct btrfs_device *latest_dev;
/* all of the devices in the FS, protected by a mutex
* so we can safely walk it to write out the supers without
@@ -300,48 +321,65 @@ struct btrfs_fs_devices {
/ sizeof(struct btrfs_stripe) + 1)
/*
- * we need the mirror number and stripe index to be passed around
- * the call chain while we are processing end_io (especially errors).
- * Really, what we need is a btrfs_bio structure that has this info
- * and is properly sized with its stripe array, but we're not there
- * quite yet. We have our own btrfs bioset, and all of the bios
- * we allocate are actually btrfs_io_bios. We'll cram as much of
- * struct btrfs_bio as we can into this over time.
+ * Additional info to pass along bio.
+ *
+ * Mostly for btrfs specific features like csum and mirror_num.
*/
-struct btrfs_io_bio {
+struct btrfs_bio {
unsigned int mirror_num;
+
+ /* for direct I/O */
+ u64 file_offset;
+
+ /* @device is for stripe IO submission. */
struct btrfs_device *device;
- u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
- * bytes for entire btrfs_io_bio but relies on bio being last.
+ * bytes for entire btrfs_bio but relies on bio being last.
*/
struct bio bio;
};
-static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
{
- return container_of(bio, struct btrfs_io_bio, bio);
+ return container_of(bio, struct btrfs_bio, bio);
}
-static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
{
- if (io_bio->csum != io_bio->csum_inline) {
- kfree(io_bio->csum);
- io_bio->csum = NULL;
+ if (bbio->csum != bbio->csum_inline) {
+ kfree(bbio->csum);
+ bbio->csum = NULL;
}
}
-struct btrfs_bio_stripe {
+struct btrfs_io_stripe {
struct btrfs_device *dev;
u64 physical;
u64 length; /* only used for discard mappings */
};
-struct btrfs_bio {
+/*
+ * Context for IO submission for device stripe.
+ *
+ * - Track the unfinished mirrors for mirror based profiles
+ * Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
+ *
+ * - Contain the logical -> physical mapping info
+ * Used by submit_stripe_bio() for mapping logical bio
+ * into physical device address.
+ *
+ * - Contain device replace info
+ * Used by handle_ops_on_dev_replace() to copy logical bios
+ * into the new device.
+ *
+ * - Contain RAID56 full stripe logical bytenrs
+ */
+struct btrfs_io_context {
refcount_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
@@ -361,7 +399,7 @@ struct btrfs_bio {
* so raid_map[0] is the start of our full stripe
*/
u64 *raid_map;
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
struct btrfs_device_info {
@@ -396,11 +434,11 @@ struct map_lookup {
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_bio_stripe) * (n)))
+ (sizeof(struct btrfs_io_stripe) * (n)))
struct btrfs_balance_args;
struct btrfs_balance_progress;
@@ -414,6 +452,22 @@ struct btrfs_balance_control {
struct btrfs_balance_progress stat;
};
+/*
+ * Search for a given device by the set parameters
+ */
+struct btrfs_dev_lookup_args {
+ u64 devid;
+ u8 *uuid;
+ u8 *fsid;
+ bool missing;
+};
+
+/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */
+#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 }
+
+#define BTRFS_DEV_LOOKUP_ARGS(name) \
+ struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT
+
enum btrfs_map_op {
BTRFS_MAP_READ,
BTRFS_MAP_WRITE,
@@ -437,20 +491,20 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
}
}
-void btrfs_get_bbio(struct btrfs_bio *bbio);
-void btrfs_put_bbio(struct btrfs_bio *bbio);
+void btrfs_get_bioc(struct btrfs_io_context *bioc);
+void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num);
+ struct btrfs_io_context **bioc_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret);
+ struct btrfs_io_context **bioc_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
@@ -459,7 +513,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
-int btrfs_forget_devices(const char *path);
+int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
void btrfs_assign_next_active_device(struct btrfs_device *device,
@@ -467,19 +521,23 @@ void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
u64 devid,
const char *devpath);
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
- const char *device_path, u64 devid,
+ struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -493,7 +551,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
@@ -580,6 +638,6 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
-int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 8a4514283a4b..85691dc2232f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -138,7 +138,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
* matches our target xattr, so lets check.
*/
ret = 0;
- btrfs_assert_tree_locked(path->nodes[0]);
+ btrfs_assert_tree_write_locked(path->nodes[0]);
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
@@ -168,9 +168,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
const int slot = path->slots[0];
struct extent_buffer *leaf = path->nodes[0];
const u16 old_data_len = btrfs_dir_data_len(leaf, di);
- const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 item_size = btrfs_item_size(leaf, slot);
const u32 data_size = sizeof(*di) + name_len + size;
- struct btrfs_item *item;
unsigned long data_ptr;
char *ptr;
@@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
btrfs_extend_item(path, data_size);
}
- item = btrfs_item_nr(slot);
ptr = btrfs_item_ptr(leaf, slot, char);
- ptr += btrfs_item_size(leaf, item) - data_size;
+ ptr += btrfs_item_size(leaf, slot) - data_size;
di = (struct btrfs_dir_item *)ptr;
btrfs_set_dir_data_len(leaf, di, size);
data_ptr = ((unsigned long)(di + 1)) + name_len;
@@ -264,7 +262,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
out:
if (start_trans)
btrfs_end_transaction(trans);
@@ -335,7 +334,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto next_item;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
cur = 0;
while (cur < item_size) {
u16 name_len = btrfs_dir_name_len(leaf, di);
@@ -405,10 +404,13 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct btrfs_root *root = BTRFS_I(inode)->root;
name = xattr_full_name(handler, name);
- ret = btrfs_validate_prop(name, value, size);
+ ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size);
if (ret)
return ret;
+ if (btrfs_ignore_prop(BTRFS_I(inode), name))
+ return 0;
+
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -418,7 +420,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 47af1ab3bf12..d31b0eda210f 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -4,6 +4,8 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
+#include <linux/atomic.h>
+#include <linux/vmalloc.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
@@ -39,12 +41,30 @@
#define BTRFS_NR_SB_LOG_ZONES 2
/*
+ * Minimum number of active zones we need:
+ *
+ * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
+ * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
+ * - 1 zone for tree-log dedicated block group
+ * - 1 zone for relocation
+ */
+#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
+
+/*
* Maximum supported zone size. Currently, SMR disks have a zone size of
* 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
* expect the zone size to become larger than 8GiB in the near future.
*/
#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+
+static inline bool sb_zone_is_full(const struct blk_zone *zone)
+{
+ return (zone->cond == BLK_ZONE_COND_FULL) ||
+ (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
+}
+
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
struct blk_zone *zones = data;
@@ -60,14 +80,13 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
+ int i;
- ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
- zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
-
- empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
- empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
- full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
- full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
+ full[i] = sb_zone_is_full(&zones[i]);
+ }
/*
* Possible states of log buffer zones
@@ -195,6 +214,8 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos,
static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
struct blk_zone *zones, unsigned int *nr_zones)
{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ u32 zno;
int ret;
if (!*nr_zones)
@@ -206,6 +227,34 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
return 0;
}
+ /* Check cache */
+ if (zinfo->zone_cache) {
+ unsigned int i;
+
+ ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+ zno = pos >> zinfo->zone_size_shift;
+ /*
+ * We cannot report zones beyond the zone end. So, it is OK to
+ * cap *nr_zones at the end.
+ */
+ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
+
+ for (i = 0; i < *nr_zones; i++) {
+ struct blk_zone *zone_info;
+
+ zone_info = &zinfo->zone_cache[zno + i];
+ if (!zone_info->len)
+ break;
+ }
+
+ if (i == *nr_zones) {
+ /* Cache hit on all the zones */
+ memcpy(zones, zinfo->zone_cache + zno,
+ sizeof(*zinfo->zone_cache) * *nr_zones);
+ return 0;
+ }
+ }
+
ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
copy_zone_info_cb, zones);
if (ret < 0) {
@@ -219,6 +268,11 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
if (!ret)
return -EIO;
+ /* Populate cache */
+ if (zinfo->zone_cache)
+ memcpy(zinfo->zone_cache + zno, zones,
+ sizeof(*zinfo->zone_cache) * *nr_zones);
+
return 0;
}
@@ -282,7 +336,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
if (!device->bdev)
continue;
- ret = btrfs_get_dev_zone_info(device);
+ ret = btrfs_get_dev_zone_info(device, true);
if (ret)
break;
}
@@ -291,11 +345,14 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
return ret;
}
-int btrfs_get_dev_zone_info(struct btrfs_device *device)
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
+ struct request_queue *queue = bdev_get_queue(bdev);
+ unsigned int max_active_zones;
+ unsigned int nactive;
sector_t nr_sectors;
sector_t sector = 0;
struct blk_zone *zones = NULL;
@@ -318,6 +375,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!zone_info)
return -ENOMEM;
+ device->zone_info = zone_info;
+
if (!bdev_is_zoned(bdev)) {
if (!fs_info->zone_size) {
ret = calculate_emulated_zone_size(fs_info);
@@ -351,6 +410,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
+ max_active_zones = queue_max_active_zones(queue);
+ if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
+ btrfs_err_in_rcu(fs_info,
+"zoned: %s: max active zones %u is too small, need at least %u active zones",
+ rcu_str_deref(device->name), max_active_zones,
+ BTRFS_MIN_ACTIVE_ZONES);
+ ret = -EINVAL;
+ goto out;
+ }
+ zone_info->max_active_zones = max_active_zones;
+
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;
@@ -363,13 +433,37 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
goto out;
}
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
if (!zones) {
ret = -ENOMEM;
goto out;
}
+ /*
+ * Enable zone cache only for a zoned device. On a non-zoned device, we
+ * fill the zone info with emulated CONVENTIONAL zones, so no need to
+ * use the cache.
+ */
+ if (populate_cache && bdev_is_zoned(device->bdev)) {
+ zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
+ zone_info->nr_zones);
+ if (!zone_info->zone_cache) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to allocate zone cache for %s",
+ rcu_str_deref(device->name));
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
/* Get zones type */
+ nactive = 0;
while (sector < nr_sectors) {
nr_zones = BTRFS_REPORT_NR_ZONES;
ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
@@ -380,8 +474,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
for (i = 0; i < nr_zones; i++) {
if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
__set_bit(nreported, zone_info->seq_zones);
- if (zones[i].cond == BLK_ZONE_COND_EMPTY)
+ switch (zones[i].cond) {
+ case BLK_ZONE_COND_EMPTY:
__set_bit(nreported, zone_info->empty_zones);
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ __set_bit(nreported, zone_info->active_zones);
+ nactive++;
+ break;
+ }
nreported++;
}
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
@@ -396,6 +499,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
goto out;
}
+ if (max_active_zones) {
+ if (nactive > max_active_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: %u active zones on %s exceeds max_active_zones %u",
+ nactive, rcu_str_deref(device->name),
+ max_active_zones);
+ ret = -EIO;
+ goto out;
+ }
+ atomic_set(&zone_info->active_zones_left,
+ max_active_zones - nactive);
+ }
+
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -444,8 +560,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
kfree(zones);
- device->zone_info = zone_info;
-
switch (bdev_zoned_model(bdev)) {
case BLK_ZONED_HM:
model = "host-managed zoned";
@@ -478,10 +592,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
out:
kfree(zones);
out_free_zone_info:
- bitmap_free(zone_info->empty_zones);
- bitmap_free(zone_info->seq_zones);
- kfree(zone_info);
- device->zone_info = NULL;
+ btrfs_destroy_dev_zone_info(device);
return ret;
}
@@ -493,8 +604,10 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
if (!zone_info)
return;
+ bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->seq_zones);
bitmap_free(zone_info->empty_zones);
+ vfree(zone_info->zone_cache);
kfree(zone_info);
device->zone_info = NULL;
}
@@ -539,8 +652,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
if (model == BLK_ZONED_HM ||
(model == BLK_ZONED_HA && incompat_zoned) ||
(model == BLK_ZONED_NONE && incompat_zoned)) {
- struct btrfs_zoned_device_info *zone_info =
- device->zone_info;
+ struct btrfs_zoned_device_info *zone_info;
zone_info = device->zone_info;
zoned_devices++;
@@ -585,7 +697,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
/*
* stripe_size is always aligned to BTRFS_STRIPE_LEN in
- * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
+ * btrfs_create_chunk(). Since we want stripe_len == zone_size,
* check the alignment here.
*/
if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
@@ -664,7 +776,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
- ASSERT(reset->cond == BLK_ZONE_COND_FULL);
+ ASSERT(sb_zone_is_full(reset));
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
reset->start, reset->len,
@@ -676,9 +788,20 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset->wp = reset->start;
}
} else if (ret != -ENOENT) {
- /* For READ, we want the precious one */
+ /*
+ * For READ, we want the previous one. Move write pointer to
+ * the end of a zone, if it is at the head of a zone.
+ */
+ u64 zone_end = 0;
+
if (wp == zones[0].start << SECTOR_SHIFT)
- wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+ zone_end = zones[1].start + zones[1].capacity;
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ zone_end = zones[0].start + zones[0].capacity;
+ if (zone_end)
+ wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
+ BTRFS_SUPER_INFO_SIZE);
+
wp -= BTRFS_SUPER_INFO_SIZE;
}
@@ -771,36 +894,56 @@ static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
return true;
}
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
struct btrfs_zoned_device_info *zinfo = device->zone_info;
struct blk_zone *zone;
+ int i;
if (!is_sb_log_zone(zinfo, mirror))
- return;
+ return 0;
zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
- if (zone->cond != BLK_ZONE_COND_FULL) {
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ /* Advance to the next zone */
+ if (zone->cond == BLK_ZONE_COND_FULL) {
+ zone++;
+ continue;
+ }
+
if (zone->cond == BLK_ZONE_COND_EMPTY)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+ zone->wp += SUPER_INFO_SECTORS;
+
+ if (sb_zone_is_full(zone)) {
+ /*
+ * No room left to write new superblock. Since
+ * superblock is written with REQ_SYNC, it is safe to
+ * finish the zone now.
+ *
+ * If the write pointer is exactly at the capacity,
+ * explicit ZONE_FINISH is not necessary.
+ */
+ if (zone->wp != zone->start + zone->capacity) {
+ int ret;
+
+ ret = blkdev_zone_mgmt(device->bdev,
+ REQ_OP_ZONE_FINISH, zone->start,
+ zone->len, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
- if (zone->wp == zone->start + zone->len)
+ zone->wp = zone->start + zone->len;
zone->cond = BLK_ZONE_COND_FULL;
-
- return;
+ }
+ return 0;
}
- zone++;
- ASSERT(zone->cond != BLK_ZONE_COND_FULL);
- if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->cond = BLK_ZONE_COND_IMP_OPEN;
-
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
-
- if (zone->wp == zone->start + zone->len)
- zone->cond = BLK_ZONE_COND_FULL;
+ /* All the zones are FULL. Should not reach here. */
+ ASSERT(0);
+ return -EIO;
}
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
@@ -895,6 +1038,41 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
return pos;
}
+static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return true;
+
+ if (!test_bit(zno, zone_info->active_zones)) {
+ /* Active zone left? */
+ if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
+ return false;
+ if (test_and_set_bit(zno, zone_info->active_zones)) {
+ /* Someone already set the bit */
+ atomic_inc(&zone_info->active_zones_left);
+ }
+ }
+
+ return true;
+}
+
+static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return;
+
+ if (test_and_clear_bit(zno, zone_info->active_zones))
+ atomic_inc(&zone_info->active_zones_left);
+}
+
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
@@ -910,6 +1088,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
*bytes = length;
while (length) {
btrfs_dev_set_zone_empty(device, physical);
+ btrfs_dev_clear_active_zone(device, physical);
physical += device->zone_info->zone_size;
length -= device->zone_info->zone_size;
}
@@ -974,7 +1153,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
u64 *offset_ret)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -989,6 +1168,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
key.type = 0;
key.offset = 0;
+ root = btrfs_extent_root(fs_info, key.objectid);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
/* We should not find the exact match */
if (!ret)
@@ -1034,11 +1214,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
struct btrfs_device *device;
u64 logical = cache->start;
u64 length = cache->length;
- u64 physical = 0;
int ret;
int i;
unsigned int nofs_flag;
u64 *alloc_offsets = NULL;
+ u64 *caps = NULL;
+ u64 *physical = NULL;
+ unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1063,10 +1245,34 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
map = em->map_lookup;
+ cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ if (!cache->physical_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
if (!alloc_offsets) {
- free_extent_map(em);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
+ if (!caps) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
+ if (!physical) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
+ if (!active) {
+ ret = -ENOMEM;
+ goto out;
}
for (i = 0; i < map->num_stripes; i++) {
@@ -1076,14 +1282,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
int dev_replace_is_ongoing = 0;
device = map->stripes[i].dev;
- physical = map->stripes[i].physical;
+ physical[i] = map->stripes[i].physical;
if (device->bdev == NULL) {
alloc_offsets[i] = WP_MISSING_DEV;
continue;
}
- is_sequential = btrfs_dev_is_sequential(device, physical);
+ is_sequential = btrfs_dev_is_sequential(device, physical[i]);
if (is_sequential)
num_sequential++;
else
@@ -1098,21 +1304,21 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* This zone will be used for allocation, so mark this zone
* non-empty.
*/
- btrfs_dev_clear_zone_empty(device, physical);
+ btrfs_dev_clear_zone_empty(device, physical[i]);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
up_read(&dev_replace->rwsem);
/*
* The group is mapped to a sequential zone. Get the zone write
* pointer to determine the allocation offset within the zone.
*/
- WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
+ WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
nofs_flag = memalloc_nofs_save();
- ret = btrfs_get_dev_zone(device, physical, &zone);
+ ret = btrfs_get_dev_zone(device, physical[i], &zone);
memalloc_nofs_restore(nofs_flag);
if (ret == -EIO || ret == -EOPNOTSUPP) {
ret = 0;
@@ -1131,12 +1337,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ caps[i] = (zone.capacity << SECTOR_SHIFT);
+
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
btrfs_err(fs_info,
"zoned: offline/readonly zone %llu on device %s (devid %llu)",
- physical >> device->zone_info->zone_size_shift,
+ physical[i] >> device->zone_info->zone_size_shift,
rcu_str_deref(device->name), device->devid);
alloc_offsets[i] = WP_MISSING_DEV;
break;
@@ -1144,14 +1352,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
alloc_offsets[i] = 0;
break;
case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = fs_info->zone_size;
+ alloc_offsets[i] = caps[i];
break;
default:
/* Partially used zone */
alloc_offsets[i] =
((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(i, active);
break;
}
+
+ /*
+ * Consider a zone as active if we can allow any number of
+ * active zones.
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
}
if (num_sequential > 0)
@@ -1169,6 +1385,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* calculate_alloc_pointer() which takes extent buffer
* locks to avoid deadlock.
*/
+
+ /* Zone capacity is always zone size in emulation */
+ cache->zone_capacity = cache->length;
if (new) {
cache->alloc_offset = 0;
goto out;
@@ -1190,13 +1409,51 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
if (alloc_offsets[0] == WP_MISSING_DEV) {
btrfs_err(fs_info,
"zoned: cannot recover write pointer for zone %llu",
- physical);
+ physical[0]);
ret = -EIO;
goto out;
}
cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = caps[0];
+ cache->zone_is_active = test_bit(0, active);
break;
case BTRFS_BLOCK_GROUP_DUP:
+ if (map->type & BTRFS_BLOCK_GROUP_DATA) {
+ btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (alloc_offsets[0] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[0]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[1] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[1]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[0] != alloc_offsets[1]) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in DUP profile");
+ ret = -EIO;
+ goto out;
+ }
+ if (test_bit(0, active) != test_bit(1, active)) {
+ if (!btrfs_zone_activate(cache)) {
+ ret = -EIO;
+ goto out;
+ }
+ } else {
+ cache->zone_is_active = test_bit(0, active);
+ }
+ cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = min(caps[0], caps[1]);
+ break;
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID0:
case BTRFS_BLOCK_GROUP_RAID10:
@@ -1210,6 +1467,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ if (cache->zone_is_active) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
out:
if (cache->alloc_offset > fs_info->zone_size) {
btrfs_err(fs_info,
@@ -1218,6 +1482,14 @@ out:
ret = -EIO;
}
+ if (cache->alloc_offset > cache->zone_capacity) {
+ btrfs_err(fs_info,
+"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
+ cache->alloc_offset, cache->zone_capacity,
+ cache->start);
+ ret = -EIO;
+ }
+
/* An extent is allocated after the write pointer */
if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
btrfs_err(fs_info,
@@ -1229,6 +1501,13 @@ out:
if (!ret)
cache->meta_write_pointer = cache->alloc_offset + cache->start;
+ if (ret) {
+ kfree(cache->physical_map);
+ cache->physical_map = NULL;
+ }
+ bitmap_free(active);
+ kfree(physical);
+ kfree(caps);
kfree(alloc_offsets);
free_extent_map(em);
@@ -1243,17 +1522,15 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
return;
WARN_ON(cache->bytes_super != 0);
- unusable = cache->alloc_offset - cache->used;
- free = cache->length - cache->alloc_offset;
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->free_space_ctl->free_space = free;
cache->zone_unusable = unusable;
-
- /* Should not have any excluded extents. Just in case, though */
- btrfs_free_excluded_extents(cache);
}
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
@@ -1304,6 +1581,17 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
if (!is_data_inode(&inode->vfs_inode))
return false;
+ /*
+ * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
+ * extent layout the relocation code has.
+ * Furthermore we have set aside own block-group from which only the
+ * relocation "process" can allocate and make sure only one process at a
+ * time can add pages to an extent that gets relocated, so it's safe to
+ * use regular REQ_OP_WRITE for this special case.
+ */
+ if (btrfs_is_data_reloc_root(inode->root))
+ return false;
+
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
if (!cache)
@@ -1391,29 +1679,19 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
if (!btrfs_is_zoned(fs_info))
return true;
- cache = *cache_ret;
+ cache = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!cache)
+ return true;
- if (cache && (eb->start < cache->start ||
- cache->start + cache->length <= eb->start)) {
+ if (cache->meta_write_pointer != eb->start) {
btrfs_put_block_group(cache);
cache = NULL;
- *cache_ret = NULL;
+ ret = false;
+ } else {
+ cache->meta_write_pointer = eb->start + eb->len;
}
- if (!cache)
- cache = btrfs_lookup_block_group(fs_info, eb->start);
-
- if (cache) {
- if (cache->meta_write_pointer != eb->start) {
- btrfs_put_block_group(cache);
- cache = NULL;
- ret = false;
- } else {
- cache->meta_write_pointer = eb->start + eb->len;
- }
-
- *cache_ret = cache;
- }
+ *cache_ret = cache;
return ret;
}
@@ -1440,27 +1718,27 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len
static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
struct blk_zone *zone)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 mapped_length = PAGE_SIZE;
unsigned int nofs_flag;
int nmirrors;
int i, ret;
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < PAGE_SIZE) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ btrfs_put_bioc(bioc);
return -EIO;
}
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
return -EINVAL;
nofs_flag = memalloc_nofs_save();
- nmirrors = (int)bbio->num_stripes;
+ nmirrors = (int)bioc->num_stripes;
for (i = 0; i < nmirrors; i++) {
- u64 physical = bbio->stripes[i].physical;
- struct btrfs_device *dev = bbio->stripes[i].dev;
+ u64 physical = bioc->stripes[i].physical;
+ struct btrfs_device *dev = bioc->stripes[i].dev;
/* Missing device */
if (!dev->bdev)
@@ -1523,10 +1801,275 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
map = em->map_lookup;
/* We only support single profile for now */
- ASSERT(map->num_stripes == 1);
device = map->stripes[0].dev;
free_extent_map(em);
return device;
}
+
+/**
+ * Activate block group and underlying device zones
+ *
+ * @block_group: the block group to activate
+ *
+ * Return: true on success, false otherwise
+ */
+bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ bool ret;
+ int i;
+
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return true;
+
+ map = block_group->physical_map;
+
+ spin_lock(&block_group->lock);
+ if (block_group->zone_is_active) {
+ ret = true;
+ goto out_unlock;
+ }
+
+ /* No space left */
+ if (block_group->alloc_offset == block_group->zone_capacity) {
+ ret = false;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ physical = map->stripes[i].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ continue;
+
+ if (!btrfs_dev_set_active_zone(device, physical)) {
+ /* Cannot activate the zone */
+ ret = false;
+ goto out_unlock;
+ }
+ }
+
+ /* Successfully activated all the zones */
+ block_group->zone_is_active = 1;
+ spin_unlock(&block_group->lock);
+
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ return true;
+
+out_unlock:
+ spin_unlock(&block_group->lock);
+ return ret;
+}
+
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ int ret = 0;
+ int i;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ map = block_group->physical_map;
+
+ spin_lock(&block_group->lock);
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ /* Check if we have unwritten allocated space */
+ if ((block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+ block_group->alloc_offset > block_group->meta_write_pointer) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or
+ * allocated space is left in the block group.
+ */
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
+
+ block_group->zone_is_active = 0;
+ block_group->alloc_offset = block_group->zone_capacity;
+ block_group->free_space_ctl->free_space = 0;
+ btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ physical = map->stripes[i].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ continue;
+
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
+
+ if (ret)
+ return ret;
+
+ btrfs_dev_clear_active_zone(device, physical);
+ }
+ btrfs_dec_block_group_ro(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+
+ return 0;
+}
+
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
+{
+ struct btrfs_fs_info *fs_info = fs_devices->fs_info;
+ struct btrfs_device *device;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ /* Check if there is a device with active zones left */
+ mutex_lock(&fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+ if (!device->bdev)
+ continue;
+
+ if (!zinfo->max_active_zones ||
+ atomic_read(&zinfo->active_zones_left)) {
+ ret = true;
+ break;
+ }
+ }
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ return ret;
+}
+
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+{
+ struct btrfs_block_group *block_group;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ ASSERT(block_group);
+
+ if (logical + length < block_group->start + block_group->zone_capacity)
+ goto out;
+
+ spin_lock(&block_group->lock);
+
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ goto out;
+ }
+
+ block_group->zone_is_active = 0;
+ /* We should have consumed all the free space */
+ ASSERT(block_group->alloc_offset == block_group->zone_capacity);
+ ASSERT(block_group->free_space_ctl->free_space == 0);
+ btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ map = block_group->physical_map;
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (!device->zone_info->max_active_zones)
+ goto out;
+
+ btrfs_dev_clear_active_zone(device, physical);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ btrfs_put_block_group(block_group);
+
+out:
+ btrfs_put_block_group(block_group);
+}
+
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg == bg->start)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
+}
+
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->zone_info) {
+ vfree(device->zone_info->zone_cache);
+ device->zone_info->zone_cache = NULL;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 4b299705bb12..6dee76248cb4 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -8,6 +8,7 @@
#include "volumes.h"
#include "disk-io.h"
#include "block-group.h"
+#include "btrfs_inode.h"
/*
* Block groups with more than this value (percents) of unusable space will be
@@ -23,8 +24,12 @@ struct btrfs_zoned_device_info {
u64 zone_size;
u8 zone_size_shift;
u32 nr_zones;
+ unsigned int max_active_zones;
+ atomic_t active_zones_left;
unsigned long *seq_zones;
unsigned long *empty_zones;
+ unsigned long *active_zones;
+ struct blk_zone *zone_cache;
struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
};
@@ -32,7 +37,7 @@ struct btrfs_zoned_device_info {
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone);
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
-int btrfs_get_dev_zone_info(struct btrfs_device *device);
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
@@ -40,7 +45,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
u64 *bytenr_ret);
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
u64 hole_end, u64 num_bytes);
@@ -66,6 +71,13 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
u64 physical_start, u64 physical_pos);
struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
+bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+int btrfs_zone_finish(struct btrfs_block_group *block_group);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -78,7 +90,8 @@ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_i
return 0;
}
-static inline int btrfs_get_dev_zone_info(struct btrfs_device *device)
+static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
+ bool populate_cache)
{
return 0;
}
@@ -113,8 +126,10 @@ static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
return 0;
}
-static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
-{ }
+static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ return 0;
+}
static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
@@ -199,6 +214,28 @@ static inline struct btrfs_device *btrfs_zoned_get_device(
return ERR_PTR(-EOPNOTSUPP);
}
+static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ return true;
+}
+
+static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ return 0;
+}
+
+static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ u64 flags)
+{
+ return true;
+}
+
+static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
+static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
@@ -317,4 +354,20 @@ static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
spin_unlock(&fs_info->treelog_bg_lock);
}
+static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+ mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
+}
+
+static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+ mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
+}
+
#endif
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index f06b68040352..fc42dd0badd7 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -28,10 +28,10 @@
/* 307s to avoid pathologically clashing with transaction commit */
#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
+static zstd_parameters zstd_get_btrfs_parameters(unsigned int level,
size_t src_len)
{
- ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
+ zstd_parameters params = zstd_get_params(level, src_len);
if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@@ -48,8 +48,8 @@ struct workspace {
unsigned long last_used; /* jiffies */
struct list_head list;
struct list_head lru_list;
- ZSTD_inBuffer in_buf;
- ZSTD_outBuffer out_buf;
+ zstd_in_buffer in_buf;
+ zstd_out_buffer out_buf;
};
/*
@@ -155,12 +155,12 @@ static void zstd_calc_ws_mem_sizes(void)
unsigned int level;
for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
- ZSTD_parameters params =
+ zstd_parameters params =
zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
size_t level_size =
max_t(size_t,
- ZSTD_CStreamWorkspaceBound(params.cParams),
- ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+ zstd_cstream_workspace_bound(&params.cParams),
+ zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT));
max_size = max_t(size_t, max_size, level_size);
zstd_ws_mem_sizes[level - 1] = max_size;
@@ -371,7 +371,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- ZSTD_CStream *stream;
+ zstd_cstream *stream;
int ret = 0;
int nr_pages = 0;
struct page *in_page = NULL; /* The current page to read */
@@ -381,7 +381,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long len = *total_out;
const unsigned long nr_dest_pages = *out_pages;
unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
+ zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
len);
*out_pages = 0;
@@ -389,10 +389,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = 0;
/* Initialize the stream */
- stream = ZSTD_initCStream(params, len, workspace->mem,
+ stream = zstd_init_cstream(&params, len, workspace->mem,
workspace->size);
if (!stream) {
- pr_warn("BTRFS: ZSTD_initCStream failed\n");
+ pr_warn("BTRFS: zstd_init_cstream failed\n");
ret = -EIO;
goto out;
}
@@ -418,11 +418,11 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
while (1) {
size_t ret2;
- ret2 = ZSTD_compressStream(stream, &workspace->out_buf,
+ ret2 = zstd_compress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (ZSTD_isError(ret2)) {
- pr_debug("BTRFS: ZSTD_compressStream returned %d\n",
- ZSTD_getErrorCode(ret2));
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_compress_stream returned %d\n",
+ zstd_get_error_code(ret2));
ret = -EIO;
goto out;
}
@@ -487,10 +487,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
while (1) {
size_t ret2;
- ret2 = ZSTD_endStream(stream, &workspace->out_buf);
- if (ZSTD_isError(ret2)) {
- pr_debug("BTRFS: ZSTD_endStream returned %d\n",
- ZSTD_getErrorCode(ret2));
+ ret2 = zstd_end_stream(stream, &workspace->out_buf);
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_end_stream returned %d\n",
+ zstd_get_error_code(ret2));
ret = -EIO;
goto out;
}
@@ -548,17 +548,17 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct page **pages_in = cb->compressed_pages;
size_t srclen = cb->compressed_len;
- ZSTD_DStream *stream;
+ zstd_dstream *stream;
int ret = 0;
unsigned long page_in_index = 0;
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long total_out = 0;
- stream = ZSTD_initDStream(
+ stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
if (!stream) {
- pr_debug("BTRFS: ZSTD_initDStream failed\n");
+ pr_debug("BTRFS: zstd_init_dstream failed\n");
ret = -EIO;
goto done;
}
@@ -574,11 +574,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
while (1) {
size_t ret2;
- ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+ ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (ZSTD_isError(ret2)) {
- pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
- ZSTD_getErrorCode(ret2));
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
+ zstd_get_error_code(ret2));
ret = -EIO;
goto done;
}
@@ -624,16 +624,16 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- ZSTD_DStream *stream;
+ zstd_dstream *stream;
int ret = 0;
size_t ret2;
unsigned long total_out = 0;
unsigned long pg_offset = 0;
- stream = ZSTD_initDStream(
+ stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
if (!stream) {
- pr_warn("BTRFS: ZSTD_initDStream failed\n");
+ pr_warn("BTRFS: zstd_init_dstream failed\n");
ret = -EIO;
goto finish;
}
@@ -657,15 +657,15 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
/* Check if the frame is over and we still need more input */
if (ret2 == 0) {
- pr_debug("BTRFS: ZSTD_decompressStream ended early\n");
+ pr_debug("BTRFS: zstd_decompress_stream ended early\n");
ret = -EIO;
goto finish;
}
- ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+ ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (ZSTD_isError(ret2)) {
- pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
- ZSTD_getErrorCode(ret2));
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
+ zstd_get_error_code(ret2));
ret = -EIO;
goto finish;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index c615387aedca..2b5561ae5d0b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -53,7 +53,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- enum rw_hint hint, struct writeback_control *wbc);
+ struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -613,17 +613,14 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* FIXME: may need to call ->reservepage here as well. That's rather up to the
* address_space though.
*/
-int __set_page_dirty_buffers(struct page *page)
+bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- int newly_dirty;
- struct address_space *mapping = page_mapping(page);
-
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
+ struct buffer_head *head;
+ bool newly_dirty;
spin_lock(&mapping->private_lock);
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
+ head = folio_buffers(folio);
+ if (head) {
struct buffer_head *bh = head;
do {
@@ -635,21 +632,21 @@ int __set_page_dirty_buffers(struct page *page)
* Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
- lock_page_memcg(page);
- newly_dirty = !TestSetPageDirty(page);
+ folio_memcg_lock(folio);
+ newly_dirty = !folio_test_set_dirty(folio);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, 1);
+ __folio_mark_dirty(folio, mapping, 1);
- unlock_page_memcg(page);
+ folio_memcg_unlock(folio);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
}
-EXPORT_SYMBOL(__set_page_dirty_buffers);
+EXPORT_SYMBOL(block_dirty_folio);
/*
* Write out and wait upon a list of buffers.
@@ -878,7 +875,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
sector_t retval = ~((sector_t)0);
- loff_t sz = i_size_read(bdev->bd_inode);
+ loff_t sz = bdev_nr_bytes(bdev);
if (sz) {
unsigned int sizebits = blksize_bits(size);
@@ -897,7 +894,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
int uptodate = PageUptodate(page);
- sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
+ sector_t end_block = blkdev_max_block(bdev, size);
do {
if (!buffer_mapped(bh)) {
@@ -1235,16 +1232,18 @@ static void bh_lru_install(struct buffer_head *bh)
int i;
check_irqs_on();
+ bh_lru_lock();
+
/*
* the refcount of buffer_head in bh_lru prevents dropping the
* attached page(i.e., try_to_free_buffers) so it could cause
* failing page migration.
* Skip putting upcoming bh into bh_lru until migration is done.
*/
- if (lru_cache_disabled())
+ if (lru_cache_disabled()) {
+ bh_lru_unlock();
return;
-
- bh_lru_lock();
+ }
b = this_cpu_ptr(&bh_lrus);
for (i = 0; i < BH_LRU_SIZE; i++) {
@@ -1482,41 +1481,40 @@ static void discard_buffer(struct buffer_head * bh)
}
/**
- * block_invalidatepage - invalidate part or all of a buffer-backed page
- *
- * @page: the page which is affected
+ * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
+ * @folio: The folio which is affected.
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
- * block_invalidatepage() is called when all or part of the page has become
+ * block_invalidate_folio() is called when all or part of the folio has been
* invalidated by a truncate operation.
*
- * block_invalidatepage() does not have to release all buffers, but it must
+ * block_invalidate_folio() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
* is underway against any of the blocks which are outside the truncation
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void block_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
struct buffer_head *head, *bh, *next;
- unsigned int curr_off = 0;
- unsigned int stop = length + offset;
+ size_t curr_off = 0;
+ size_t stop = length + offset;
- BUG_ON(!PageLocked(page));
- if (!page_has_buffers(page))
- goto out;
+ BUG_ON(!folio_test_locked(folio));
/*
* Check for overflow
*/
- BUG_ON(stop > PAGE_SIZE || stop < length);
+ BUG_ON(stop > folio_size(folio) || stop < length);
+
+ head = folio_buffers(folio);
+ if (!head)
+ return;
- head = page_buffers(page);
bh = head;
do {
- unsigned int next_off = curr_off + bh->b_size;
+ size_t next_off = curr_off + bh->b_size;
next = bh->b_this_page;
/*
@@ -1535,21 +1533,21 @@ void block_invalidatepage(struct page *page, unsigned int offset,
} while (bh != head);
/*
- * We release buffers only if the entire page is being invalidated.
+ * We release buffers only if the entire folio is being invalidated.
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (length == PAGE_SIZE)
- try_to_release_page(page, 0);
+ if (length == folio_size(folio))
+ filemap_release_folio(folio, 0);
out:
return;
}
-EXPORT_SYMBOL(block_invalidatepage);
+EXPORT_SYMBOL(block_invalidate_folio);
/*
* We attach and possibly dirty the buffers atomically wrt
- * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
+ * block_dirty_folio() via private_lock. try_to_free_buffers
* is already excluded via the page lock.
*/
void create_empty_buffers(struct page *page,
@@ -1724,12 +1722,12 @@ int __block_write_full_page(struct inode *inode, struct page *page,
(1 << BH_Dirty)|(1 << BH_Uptodate));
/*
- * Be very careful. We have no exclusion from __set_page_dirty_buffers
+ * Be very careful. We have no exclusion from block_dirty_folio
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
* then we just miss that fact, and the page stays dirty.
*
- * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+ * Buffers outside i_size may be dirtied by block_dirty_folio;
* handle that here by just cleaning them.
*/
@@ -1806,8 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
- inode->i_write_hint, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -1861,8 +1858,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
- inode->i_write_hint, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -1969,34 +1965,34 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
}
}
-int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block, const struct iomap *iomap)
{
unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned block_start, block_end;
sector_t block;
int err = 0;
unsigned blocksize, bbits;
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
BUG_ON(from > PAGE_SIZE);
BUG_ON(to > PAGE_SIZE);
BUG_ON(from > to);
- head = create_page_buffers(page, inode, 0);
+ head = create_page_buffers(&folio->page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
- block = (sector_t)page->index << (PAGE_SHIFT - bbits);
+ block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
}
@@ -2016,20 +2012,20 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
if (buffer_new(bh)) {
clean_bdev_bh_alias(bh);
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
continue;
}
if (block_end > to || block_start < from)
- zero_user_segments(page,
+ folio_zero_segments(folio,
to, block_end,
block_start, from);
continue;
}
}
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
continue;
@@ -2050,14 +2046,15 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
err = -EIO;
}
if (unlikely(err))
- page_zero_new_buffers(page, from, to);
+ page_zero_new_buffers(&folio->page, from, to);
return err;
}
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
- return __block_write_begin_int(page, pos, len, get_block, NULL);
+ return __block_write_begin_int(page_folio(page), pos, len, get_block,
+ NULL);
}
EXPORT_SYMBOL(__block_write_begin);
@@ -2205,29 +2202,27 @@ int generic_write_end(struct file *file, struct address_space *mapping,
EXPORT_SYMBOL(generic_write_end);
/*
- * block_is_partially_uptodate checks whether buffers within a page are
+ * block_is_partially_uptodate checks whether buffers within a folio are
* uptodate or not.
*
- * Returns true if all buffers which correspond to a file portion
- * we want to read are uptodate.
+ * Returns true if all buffers which correspond to the specified part
+ * of the folio are uptodate.
*/
-int block_is_partially_uptodate(struct page *page, unsigned long from,
- unsigned long count)
+bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
unsigned block_start, block_end, blocksize;
unsigned to;
struct buffer_head *bh, *head;
- int ret = 1;
-
- if (!page_has_buffers(page))
- return 0;
+ bool ret = true;
- head = page_buffers(page);
+ head = folio_buffers(folio);
+ if (!head)
+ return false;
blocksize = head->b_size;
- to = min_t(unsigned, PAGE_SIZE - from, count);
+ to = min_t(unsigned, folio_size(folio) - from, count);
to = from + to;
- if (from < blocksize && to > PAGE_SIZE - blocksize)
- return 0;
+ if (from < blocksize && to > folio_size(folio) - blocksize)
+ return false;
bh = head;
block_start = 0;
@@ -2235,7 +2230,7 @@ int block_is_partially_uptodate(struct page *page, unsigned long from,
block_end = block_start + blocksize;
if (block_end > from && block_start < to) {
if (!buffer_uptodate(bh)) {
- ret = 0;
+ ret = false;
break;
}
if (block_end >= to)
@@ -2357,8 +2352,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
if (err)
goto out;
- err = pagecache_write_begin(NULL, mapping, size, 0,
- AOP_FLAG_CONT_EXPAND, &page, &fsdata);
+ err = pagecache_write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
if (err)
goto out;
@@ -3007,7 +3001,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- enum rw_hint write_hint, struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct bio *bio;
@@ -3023,13 +3017,16 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
clear_buffer_write_io_error(bh);
- bio = bio_alloc(GFP_NOIO, 1);
+ if (buffer_meta(bh))
+ op_flags |= REQ_META;
+ if (buffer_prio(bh))
+ op_flags |= REQ_PRIO;
+
+ bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
- bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
BUG_ON(bio->bi_iter.bi_size != bh->b_size);
@@ -3037,12 +3034,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
- if (buffer_meta(bh))
- op_flags |= REQ_META;
- if (buffer_prio(bh))
- op_flags |= REQ_PRIO;
- bio_set_op_attrs(bio, op, op_flags);
-
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(bio);
@@ -3057,7 +3048,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
- return submit_bh_wbc(op, op_flags, bh, 0, NULL);
+ return submit_bh_wbc(op, op_flags, bh, NULL);
}
EXPORT_SYMBOL(submit_bh);
@@ -3184,7 +3175,7 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*
* The same applies to regular filesystem pages: if all the buffers are
* clean then we set the page clean and proceed. To do that, we require
- * total exclusion from __set_page_dirty_buffers(). That is obtained with
+ * total exclusion from block_dirty_folio(). That is obtained with
* private_lock.
*
* try_to_free_buffers() is non-blocking.
@@ -3251,7 +3242,7 @@ int try_to_free_buffers(struct page *page)
* the page also.
*
* private_lock must be held over this entire operation in order
- * to synchronise against __set_page_dirty_buffers and prevent the
+ * to synchronise against block_dirty_folio and prevent the
* dirty bit from being lost.
*/
if (ret)
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index 6827b40f7ddc..719faeeda168 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -19,3 +19,10 @@ config CACHEFILES_DEBUG
caching on files module. If this is set, the debugging output may be
enabled by setting bits in /sys/modules/cachefiles/parameter/debug or
by including a debugging specifier in /etc/cachefilesd.conf.
+
+config CACHEFILES_ERROR_INJECTION
+ bool "Provide error injection for cachefiles"
+ depends on CACHEFILES && SYSCTL
+ help
+ This permits error injection to be enabled in cachefiles whilst a
+ cache is in service.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 02fd17731769..16d811f1a2fa 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -4,15 +4,17 @@
#
cachefiles-y := \
- bind.o \
+ cache.o \
daemon.o \
interface.o \
io.o \
key.o \
main.o \
namei.o \
- rdwr.o \
security.o \
+ volume.o \
xattr.o
+cachefiles-$(CONFIG_CACHEFILES_ERROR_INJECTION) += error_inject.o
+
obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
deleted file mode 100644
index d463d89f5db8..000000000000
--- a/fs/cachefiles/bind.c
+++ /dev/null
@@ -1,278 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Bind and unbind a cache from the filesystem backing it
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/completion.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/statfs.h>
-#include <linux/ctype.h>
-#include <linux/xattr.h>
-#include "internal.h"
-
-static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches);
-
-/*
- * bind a directory as a cache
- */
-int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
-{
- _enter("{%u,%u,%u,%u,%u,%u},%s",
- cache->frun_percent,
- cache->fcull_percent,
- cache->fstop_percent,
- cache->brun_percent,
- cache->bcull_percent,
- cache->bstop_percent,
- args);
-
- /* start by checking things over */
- ASSERT(cache->fstop_percent >= 0 &&
- cache->fstop_percent < cache->fcull_percent &&
- cache->fcull_percent < cache->frun_percent &&
- cache->frun_percent < 100);
-
- ASSERT(cache->bstop_percent >= 0 &&
- cache->bstop_percent < cache->bcull_percent &&
- cache->bcull_percent < cache->brun_percent &&
- cache->brun_percent < 100);
-
- if (*args) {
- pr_err("'bind' command doesn't take an argument\n");
- return -EINVAL;
- }
-
- if (!cache->rootdirname) {
- pr_err("No cache directory specified\n");
- return -EINVAL;
- }
-
- /* don't permit already bound caches to be re-bound */
- if (test_bit(CACHEFILES_READY, &cache->flags)) {
- pr_err("Cache already bound\n");
- return -EBUSY;
- }
-
- /* make sure we have copies of the tag and dirname strings */
- if (!cache->tag) {
- /* the tag string is released by the fops->release()
- * function, so we don't release it on error here */
- cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
- if (!cache->tag)
- return -ENOMEM;
- }
-
- /* add the cache */
- return cachefiles_daemon_add_cache(cache);
-}
-
-/*
- * add a cache
- */
-static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
-{
- struct cachefiles_object *fsdef;
- struct path path;
- struct kstatfs stats;
- struct dentry *graveyard, *cachedir, *root;
- const struct cred *saved_cred;
- int ret;
-
- _enter("");
-
- /* we want to work under the module's security ID */
- ret = cachefiles_get_security_ID(cache);
- if (ret < 0)
- return ret;
-
- cachefiles_begin_secure(cache, &saved_cred);
-
- /* allocate the root index object */
- ret = -ENOMEM;
-
- fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
- if (!fsdef)
- goto error_root_object;
-
- ASSERTCMP(fsdef->backer, ==, NULL);
-
- atomic_set(&fsdef->usage, 1);
- fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
-
- /* look up the directory at the root of the cache */
- ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
- if (ret < 0)
- goto error_open_root;
-
- cache->mnt = path.mnt;
- root = path.dentry;
-
- ret = -EINVAL;
- if (mnt_user_ns(path.mnt) != &init_user_ns) {
- pr_warn("File cache on idmapped mounts not supported");
- goto error_unsupported;
- }
-
- /* check parameters */
- ret = -EOPNOTSUPP;
- if (d_is_negative(root) ||
- !d_backing_inode(root)->i_op->lookup ||
- !d_backing_inode(root)->i_op->mkdir ||
- !(d_backing_inode(root)->i_opflags & IOP_XATTR) ||
- !root->d_sb->s_op->statfs ||
- !root->d_sb->s_op->sync_fs)
- goto error_unsupported;
-
- ret = -EROFS;
- if (sb_rdonly(root->d_sb))
- goto error_unsupported;
-
- /* determine the security of the on-disk cache as this governs
- * security ID of files we create */
- ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
- if (ret < 0)
- goto error_unsupported;
-
- /* get the cache size and blocksize */
- ret = vfs_statfs(&path, &stats);
- if (ret < 0)
- goto error_unsupported;
-
- ret = -ERANGE;
- if (stats.f_bsize <= 0)
- goto error_unsupported;
-
- ret = -EOPNOTSUPP;
- if (stats.f_bsize > PAGE_SIZE)
- goto error_unsupported;
-
- cache->bsize = stats.f_bsize;
- cache->bshift = 0;
- if (stats.f_bsize < PAGE_SIZE)
- cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
-
- _debug("blksize %u (shift %u)",
- cache->bsize, cache->bshift);
-
- _debug("size %llu, avail %llu",
- (unsigned long long) stats.f_blocks,
- (unsigned long long) stats.f_bavail);
-
- /* set up caching limits */
- do_div(stats.f_files, 100);
- cache->fstop = stats.f_files * cache->fstop_percent;
- cache->fcull = stats.f_files * cache->fcull_percent;
- cache->frun = stats.f_files * cache->frun_percent;
-
- _debug("limits {%llu,%llu,%llu} files",
- (unsigned long long) cache->frun,
- (unsigned long long) cache->fcull,
- (unsigned long long) cache->fstop);
-
- stats.f_blocks >>= cache->bshift;
- do_div(stats.f_blocks, 100);
- cache->bstop = stats.f_blocks * cache->bstop_percent;
- cache->bcull = stats.f_blocks * cache->bcull_percent;
- cache->brun = stats.f_blocks * cache->brun_percent;
-
- _debug("limits {%llu,%llu,%llu} blocks",
- (unsigned long long) cache->brun,
- (unsigned long long) cache->bcull,
- (unsigned long long) cache->bstop);
-
- /* get the cache directory and check its type */
- cachedir = cachefiles_get_directory(cache, root, "cache");
- if (IS_ERR(cachedir)) {
- ret = PTR_ERR(cachedir);
- goto error_unsupported;
- }
-
- fsdef->dentry = cachedir;
- fsdef->fscache.cookie = NULL;
-
- ret = cachefiles_check_object_type(fsdef);
- if (ret < 0)
- goto error_unsupported;
-
- /* get the graveyard directory */
- graveyard = cachefiles_get_directory(cache, root, "graveyard");
- if (IS_ERR(graveyard)) {
- ret = PTR_ERR(graveyard);
- goto error_unsupported;
- }
-
- cache->graveyard = graveyard;
-
- /* publish the cache */
- fscache_init_cache(&cache->cache,
- &cachefiles_cache_ops,
- "%s",
- fsdef->dentry->d_sb->s_id);
-
- fscache_object_init(&fsdef->fscache, &fscache_fsdef_index,
- &cache->cache);
-
- ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
- if (ret < 0)
- goto error_add_cache;
-
- /* done */
- set_bit(CACHEFILES_READY, &cache->flags);
- dput(root);
-
- pr_info("File cache on %s registered\n", cache->cache.identifier);
-
- /* check how much space the cache has */
- cachefiles_has_space(cache, 0, 0);
- cachefiles_end_secure(cache, saved_cred);
- return 0;
-
-error_add_cache:
- dput(cache->graveyard);
- cache->graveyard = NULL;
-error_unsupported:
- mntput(cache->mnt);
- cache->mnt = NULL;
- dput(fsdef->dentry);
- fsdef->dentry = NULL;
- dput(root);
-error_open_root:
- kmem_cache_free(cachefiles_object_jar, fsdef);
-error_root_object:
- cachefiles_end_secure(cache, saved_cred);
- pr_err("Failed to register: %d\n", ret);
- return ret;
-}
-
-/*
- * unbind a cache on fd release
- */
-void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
-{
- _enter("");
-
- if (test_bit(CACHEFILES_READY, &cache->flags)) {
- pr_info("File cache on %s unregistering\n",
- cache->cache.identifier);
-
- fscache_withdraw_cache(&cache->cache);
- }
-
- dput(cache->graveyard);
- mntput(cache->mnt);
-
- kfree(cache->rootdirname);
- kfree(cache->secctx);
- kfree(cache->tag);
-
- _leave("");
-}
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
new file mode 100644
index 000000000000..7077f72e6f47
--- /dev/null
+++ b/fs/cachefiles/cache.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Manage high-level VFS aspects of a cache.
+ *
+ * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+/*
+ * Bring a cache online.
+ */
+int cachefiles_add_cache(struct cachefiles_cache *cache)
+{
+ struct fscache_cache *cache_cookie;
+ struct path path;
+ struct kstatfs stats;
+ struct dentry *graveyard, *cachedir, *root;
+ const struct cred *saved_cred;
+ int ret;
+
+ _enter("");
+
+ cache_cookie = fscache_acquire_cache(cache->tag);
+ if (IS_ERR(cache_cookie))
+ return PTR_ERR(cache_cookie);
+
+ /* we want to work under the module's security ID */
+ ret = cachefiles_get_security_ID(cache);
+ if (ret < 0)
+ goto error_getsec;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+
+ /* look up the directory at the root of the cache */
+ ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
+ if (ret < 0)
+ goto error_open_root;
+
+ cache->mnt = path.mnt;
+ root = path.dentry;
+
+ ret = -EINVAL;
+ if (is_idmapped_mnt(path.mnt)) {
+ pr_warn("File cache on idmapped mounts not supported");
+ goto error_unsupported;
+ }
+
+ /* Check features of the backing filesystem:
+ * - Directories must support looking up and directory creation
+ * - We create tmpfiles to handle invalidation
+ * - We use xattrs to store metadata
+ * - We need to be able to query the amount of space available
+ * - We want to be able to sync the filesystem when stopping the cache
+ * - We use DIO to/from pages, so the blocksize mustn't be too big.
+ */
+ ret = -EOPNOTSUPP;
+ if (d_is_negative(root) ||
+ !d_backing_inode(root)->i_op->lookup ||
+ !d_backing_inode(root)->i_op->mkdir ||
+ !d_backing_inode(root)->i_op->tmpfile ||
+ !(d_backing_inode(root)->i_opflags & IOP_XATTR) ||
+ !root->d_sb->s_op->statfs ||
+ !root->d_sb->s_op->sync_fs ||
+ root->d_sb->s_blocksize > PAGE_SIZE)
+ goto error_unsupported;
+
+ ret = -EROFS;
+ if (sb_rdonly(root->d_sb))
+ goto error_unsupported;
+
+ /* determine the security of the on-disk cache as this governs
+ * security ID of files we create */
+ ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
+ if (ret < 0)
+ goto error_unsupported;
+
+ /* get the cache size and blocksize */
+ ret = vfs_statfs(&path, &stats);
+ if (ret < 0)
+ goto error_unsupported;
+
+ ret = -ERANGE;
+ if (stats.f_bsize <= 0)
+ goto error_unsupported;
+
+ ret = -EOPNOTSUPP;
+ if (stats.f_bsize > PAGE_SIZE)
+ goto error_unsupported;
+
+ cache->bsize = stats.f_bsize;
+ cache->bshift = ilog2(stats.f_bsize);
+
+ _debug("blksize %u (shift %u)",
+ cache->bsize, cache->bshift);
+
+ _debug("size %llu, avail %llu",
+ (unsigned long long) stats.f_blocks,
+ (unsigned long long) stats.f_bavail);
+
+ /* set up caching limits */
+ do_div(stats.f_files, 100);
+ cache->fstop = stats.f_files * cache->fstop_percent;
+ cache->fcull = stats.f_files * cache->fcull_percent;
+ cache->frun = stats.f_files * cache->frun_percent;
+
+ _debug("limits {%llu,%llu,%llu} files",
+ (unsigned long long) cache->frun,
+ (unsigned long long) cache->fcull,
+ (unsigned long long) cache->fstop);
+
+ do_div(stats.f_blocks, 100);
+ cache->bstop = stats.f_blocks * cache->bstop_percent;
+ cache->bcull = stats.f_blocks * cache->bcull_percent;
+ cache->brun = stats.f_blocks * cache->brun_percent;
+
+ _debug("limits {%llu,%llu,%llu} blocks",
+ (unsigned long long) cache->brun,
+ (unsigned long long) cache->bcull,
+ (unsigned long long) cache->bstop);
+
+ /* get the cache directory and check its type */
+ cachedir = cachefiles_get_directory(cache, root, "cache", NULL);
+ if (IS_ERR(cachedir)) {
+ ret = PTR_ERR(cachedir);
+ goto error_unsupported;
+ }
+
+ cache->store = cachedir;
+
+ /* get the graveyard directory */
+ graveyard = cachefiles_get_directory(cache, root, "graveyard", NULL);
+ if (IS_ERR(graveyard)) {
+ ret = PTR_ERR(graveyard);
+ goto error_unsupported;
+ }
+
+ cache->graveyard = graveyard;
+ cache->cache = cache_cookie;
+
+ ret = fscache_add_cache(cache_cookie, &cachefiles_cache_ops, cache);
+ if (ret < 0)
+ goto error_add_cache;
+
+ /* done */
+ set_bit(CACHEFILES_READY, &cache->flags);
+ dput(root);
+
+ pr_info("File cache on %s registered\n", cache_cookie->name);
+
+ /* check how much space the cache has */
+ cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check);
+ cachefiles_end_secure(cache, saved_cred);
+ _leave(" = 0 [%px]", cache->cache);
+ return 0;
+
+error_add_cache:
+ cachefiles_put_directory(cache->graveyard);
+ cache->graveyard = NULL;
+error_unsupported:
+ cachefiles_put_directory(cache->store);
+ cache->store = NULL;
+ mntput(cache->mnt);
+ cache->mnt = NULL;
+ dput(root);
+error_open_root:
+ cachefiles_end_secure(cache, saved_cred);
+error_getsec:
+ fscache_relinquish_cache(cache_cookie);
+ cache->cache = NULL;
+ pr_err("Failed to register: %d\n", ret);
+ return ret;
+}
+
+/*
+ * See if we have space for a number of pages and/or a number of files in the
+ * cache
+ */
+int cachefiles_has_space(struct cachefiles_cache *cache,
+ unsigned fnr, unsigned bnr,
+ enum cachefiles_has_space_for reason)
+{
+ struct kstatfs stats;
+ u64 b_avail, b_writing;
+ int ret;
+
+ struct path path = {
+ .mnt = cache->mnt,
+ .dentry = cache->mnt->mnt_root,
+ };
+
+ //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
+ // (unsigned long long) cache->frun,
+ // (unsigned long long) cache->fcull,
+ // (unsigned long long) cache->fstop,
+ // (unsigned long long) cache->brun,
+ // (unsigned long long) cache->bcull,
+ // (unsigned long long) cache->bstop,
+ // fnr, bnr);
+
+ /* find out how many pages of blockdev are available */
+ memset(&stats, 0, sizeof(stats));
+
+ ret = vfs_statfs(&path, &stats);
+ if (ret < 0) {
+ trace_cachefiles_vfs_error(NULL, d_inode(path.dentry), ret,
+ cachefiles_trace_statfs_error);
+ if (ret == -EIO)
+ cachefiles_io_error(cache, "statfs failed");
+ _leave(" = %d", ret);
+ return ret;
+ }
+
+ b_avail = stats.f_bavail;
+ b_writing = atomic_long_read(&cache->b_writing);
+ if (b_avail > b_writing)
+ b_avail -= b_writing;
+ else
+ b_avail = 0;
+
+ //_debug("avail %llu,%llu",
+ // (unsigned long long)stats.f_ffree,
+ // (unsigned long long)b_avail);
+
+ /* see if there is sufficient space */
+ if (stats.f_ffree > fnr)
+ stats.f_ffree -= fnr;
+ else
+ stats.f_ffree = 0;
+
+ if (b_avail > bnr)
+ b_avail -= bnr;
+ else
+ b_avail = 0;
+
+ ret = -ENOBUFS;
+ if (stats.f_ffree < cache->fstop ||
+ b_avail < cache->bstop)
+ goto stop_and_begin_cull;
+
+ ret = 0;
+ if (stats.f_ffree < cache->fcull ||
+ b_avail < cache->bcull)
+ goto begin_cull;
+
+ if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
+ stats.f_ffree >= cache->frun &&
+ b_avail >= cache->brun &&
+ test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
+ ) {
+ _debug("cease culling");
+ cachefiles_state_changed(cache);
+ }
+
+ //_leave(" = 0");
+ return 0;
+
+stop_and_begin_cull:
+ switch (reason) {
+ case cachefiles_has_space_for_write:
+ fscache_count_no_write_space();
+ break;
+ case cachefiles_has_space_for_create:
+ fscache_count_no_create_space();
+ break;
+ default:
+ break;
+ }
+begin_cull:
+ if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
+ _debug("### CULL CACHE ###");
+ cachefiles_state_changed(cache);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Mark all the objects as being out of service and queue them all for cleanup.
+ */
+static void cachefiles_withdraw_objects(struct cachefiles_cache *cache)
+{
+ struct cachefiles_object *object;
+ unsigned int count = 0;
+
+ _enter("");
+
+ spin_lock(&cache->object_list_lock);
+
+ while (!list_empty(&cache->object_list)) {
+ object = list_first_entry(&cache->object_list,
+ struct cachefiles_object, cache_link);
+ cachefiles_see_object(object, cachefiles_obj_see_withdrawal);
+ list_del_init(&object->cache_link);
+ fscache_withdraw_cookie(object->cookie);
+ count++;
+ if ((count & 63) == 0) {
+ spin_unlock(&cache->object_list_lock);
+ cond_resched();
+ spin_lock(&cache->object_list_lock);
+ }
+ }
+
+ spin_unlock(&cache->object_list_lock);
+ _leave(" [%u objs]", count);
+}
+
+/*
+ * Withdraw volumes.
+ */
+static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache)
+{
+ _enter("");
+
+ for (;;) {
+ struct cachefiles_volume *volume = NULL;
+
+ spin_lock(&cache->object_list_lock);
+ if (!list_empty(&cache->volumes)) {
+ volume = list_first_entry(&cache->volumes,
+ struct cachefiles_volume, cache_link);
+ list_del_init(&volume->cache_link);
+ }
+ spin_unlock(&cache->object_list_lock);
+ if (!volume)
+ break;
+
+ cachefiles_withdraw_volume(volume);
+ }
+
+ _leave("");
+}
+
+/*
+ * Sync a cache to backing disk.
+ */
+static void cachefiles_sync_cache(struct cachefiles_cache *cache)
+{
+ const struct cred *saved_cred;
+ int ret;
+
+ _enter("%s", cache->cache->name);
+
+ /* make sure all pages pinned by operations on behalf of the netfs are
+ * written to disc */
+ cachefiles_begin_secure(cache, &saved_cred);
+ down_read(&cache->mnt->mnt_sb->s_umount);
+ ret = sync_filesystem(cache->mnt->mnt_sb);
+ up_read(&cache->mnt->mnt_sb->s_umount);
+ cachefiles_end_secure(cache, saved_cred);
+
+ if (ret == -EIO)
+ cachefiles_io_error(cache,
+ "Attempt to sync backing fs superblock returned error %d",
+ ret);
+}
+
+/*
+ * Withdraw cache objects.
+ */
+void cachefiles_withdraw_cache(struct cachefiles_cache *cache)
+{
+ struct fscache_cache *fscache = cache->cache;
+
+ pr_info("File cache on %s unregistering\n", fscache->name);
+
+ fscache_withdraw_cache(fscache);
+
+ /* we now have to destroy all the active objects pertaining to this
+ * cache - which we do by passing them off to thread pool to be
+ * disposed of */
+ cachefiles_withdraw_objects(cache);
+ fscache_wait_for_objects(fscache);
+
+ cachefiles_withdraw_volumes(cache);
+ cachefiles_sync_cache(cache);
+ cache->cache = NULL;
+ fscache_relinquish_cache(fscache);
+}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 752c1e43416f..7ac04ee2c0a0 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Daemon interface
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
@@ -41,6 +41,8 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_bind(struct cachefiles_cache *, char *);
+static void cachefiles_daemon_unbind(struct cachefiles_cache *);
static unsigned long cachefiles_open;
@@ -78,7 +80,7 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
/*
- * do various checks
+ * Prepare a cache for caching.
*/
static int cachefiles_daemon_open(struct inode *inode, struct file *file)
{
@@ -102,9 +104,10 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
}
mutex_init(&cache->daemon_mutex);
- cache->active_nodes = RB_ROOT;
- rwlock_init(&cache->active_lock);
init_waitqueue_head(&cache->daemon_pollwq);
+ INIT_LIST_HEAD(&cache->volumes);
+ INIT_LIST_HEAD(&cache->object_list);
+ spin_lock_init(&cache->object_list_lock);
/* set default caching limits
* - limit at 1% free space and/or free files
@@ -124,7 +127,7 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
}
/*
- * release a cache
+ * Release a cache.
*/
static int cachefiles_daemon_release(struct inode *inode, struct file *file)
{
@@ -138,8 +141,6 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file)
cachefiles_daemon_unbind(cache);
- ASSERT(!cache->active_nodes.rb_node);
-
/* clean up the control file interface */
cache->cachefilesd = NULL;
file->private_data = NULL;
@@ -152,7 +153,7 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file)
}
/*
- * read the cache state
+ * Read the cache state.
*/
static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
size_t buflen, loff_t *pos)
@@ -169,7 +170,7 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
return 0;
/* check how much space the cache has */
- cachefiles_has_space(cache, 0, 0);
+ cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check);
/* summarise */
f_released = atomic_xchg(&cache->f_released, 0);
@@ -206,7 +207,7 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
}
/*
- * command the cache
+ * Take a command from cachefilesd, parse it and act on it.
*/
static ssize_t cachefiles_daemon_write(struct file *file,
const char __user *_data,
@@ -225,7 +226,7 @@ static ssize_t cachefiles_daemon_write(struct file *file,
if (test_bit(CACHEFILES_DEAD, &cache->flags))
return -EIO;
- if (datalen < 0 || datalen > PAGE_SIZE - 1)
+ if (datalen > PAGE_SIZE - 1)
return -EOPNOTSUPP;
/* drag the command string into the kernel so we can parse it */
@@ -284,7 +285,7 @@ found_command:
}
/*
- * poll for culling state
+ * Poll for culling state
* - use EPOLLOUT to indicate culling state
*/
static __poll_t cachefiles_daemon_poll(struct file *file,
@@ -306,7 +307,7 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
}
/*
- * give a range error for cache space constraints
+ * Give a range error for cache space constraints
* - can be tail-called
*/
static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
@@ -318,7 +319,7 @@ static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
}
/*
- * set the percentage of files at which to stop culling
+ * Set the percentage of files at which to stop culling
* - command: "frun <N>%"
*/
static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
@@ -342,7 +343,7 @@ static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
}
/*
- * set the percentage of files at which to start culling
+ * Set the percentage of files at which to start culling
* - command: "fcull <N>%"
*/
static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
@@ -366,7 +367,7 @@ static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
}
/*
- * set the percentage of files at which to stop allocating
+ * Set the percentage of files at which to stop allocating
* - command: "fstop <N>%"
*/
static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
@@ -382,7 +383,7 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
- if (fstop < 0 || fstop >= cache->fcull_percent)
+ if (fstop >= cache->fcull_percent)
return cachefiles_daemon_range_error(cache, args);
cache->fstop_percent = fstop;
@@ -390,7 +391,7 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
}
/*
- * set the percentage of blocks at which to stop culling
+ * Set the percentage of blocks at which to stop culling
* - command: "brun <N>%"
*/
static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
@@ -414,7 +415,7 @@ static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
}
/*
- * set the percentage of blocks at which to start culling
+ * Set the percentage of blocks at which to start culling
* - command: "bcull <N>%"
*/
static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
@@ -438,7 +439,7 @@ static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
}
/*
- * set the percentage of blocks at which to stop allocating
+ * Set the percentage of blocks at which to stop allocating
* - command: "bstop <N>%"
*/
static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
@@ -454,7 +455,7 @@ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
- if (bstop < 0 || bstop >= cache->bcull_percent)
+ if (bstop >= cache->bcull_percent)
return cachefiles_daemon_range_error(cache, args);
cache->bstop_percent = bstop;
@@ -462,7 +463,7 @@ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
}
/*
- * set the cache directory
+ * Set the cache directory
* - command: "dir <name>"
*/
static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
@@ -490,7 +491,7 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
}
/*
- * set the cache security context
+ * Set the cache security context
* - command: "secctx <ctx>"
*/
static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
@@ -518,7 +519,7 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
}
/*
- * set the cache tag
+ * Set the cache tag
* - command: "tag <name>"
*/
static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
@@ -544,7 +545,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
}
/*
- * request a node in the cache be culled from the current working directory
+ * Request a node in the cache be culled from the current working directory
* - command: "cull <name>"
*/
static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
@@ -568,7 +569,6 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
return -EIO;
}
- /* extract the directory dentry from the cwd */
get_fs_pwd(current->fs, &path);
if (!d_can_lookup(path.dentry))
@@ -593,7 +593,7 @@ inval:
}
/*
- * set debugging mode
+ * Set debugging mode
* - command: "debug <mask>"
*/
static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
@@ -616,7 +616,7 @@ inval:
}
/*
- * find out whether an object in the current working directory is in use or not
+ * Find out whether an object in the current working directory is in use or not
* - command: "inuse <name>"
*/
static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
@@ -640,7 +640,6 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
return -EIO;
}
- /* extract the directory dentry from the cwd */
get_fs_pwd(current->fs, &path);
if (!d_can_lookup(path.dentry))
@@ -665,84 +664,76 @@ inval:
}
/*
- * see if we have space for a number of pages and/or a number of files in the
- * cache
+ * Bind a directory as a cache
*/
-int cachefiles_has_space(struct cachefiles_cache *cache,
- unsigned fnr, unsigned bnr)
+static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
{
- struct kstatfs stats;
- struct path path = {
- .mnt = cache->mnt,
- .dentry = cache->mnt->mnt_root,
- };
- int ret;
+ _enter("{%u,%u,%u,%u,%u,%u},%s",
+ cache->frun_percent,
+ cache->fcull_percent,
+ cache->fstop_percent,
+ cache->brun_percent,
+ cache->bcull_percent,
+ cache->bstop_percent,
+ args);
+
+ if (cache->fstop_percent >= cache->fcull_percent ||
+ cache->fcull_percent >= cache->frun_percent ||
+ cache->frun_percent >= 100)
+ return -ERANGE;
+
+ if (cache->bstop_percent >= cache->bcull_percent ||
+ cache->bcull_percent >= cache->brun_percent ||
+ cache->brun_percent >= 100)
+ return -ERANGE;
- //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
- // (unsigned long long) cache->frun,
- // (unsigned long long) cache->fcull,
- // (unsigned long long) cache->fstop,
- // (unsigned long long) cache->brun,
- // (unsigned long long) cache->bcull,
- // (unsigned long long) cache->bstop,
- // fnr, bnr);
-
- /* find out how many pages of blockdev are available */
- memset(&stats, 0, sizeof(stats));
-
- ret = vfs_statfs(&path, &stats);
- if (ret < 0) {
- if (ret == -EIO)
- cachefiles_io_error(cache, "statfs failed");
- _leave(" = %d", ret);
- return ret;
+ if (*args) {
+ pr_err("'bind' command doesn't take an argument\n");
+ return -EINVAL;
}
- stats.f_bavail >>= cache->bshift;
-
- //_debug("avail %llu,%llu",
- // (unsigned long long) stats.f_ffree,
- // (unsigned long long) stats.f_bavail);
-
- /* see if there is sufficient space */
- if (stats.f_ffree > fnr)
- stats.f_ffree -= fnr;
- else
- stats.f_ffree = 0;
-
- if (stats.f_bavail > bnr)
- stats.f_bavail -= bnr;
- else
- stats.f_bavail = 0;
-
- ret = -ENOBUFS;
- if (stats.f_ffree < cache->fstop ||
- stats.f_bavail < cache->bstop)
- goto begin_cull;
-
- ret = 0;
- if (stats.f_ffree < cache->fcull ||
- stats.f_bavail < cache->bcull)
- goto begin_cull;
-
- if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
- stats.f_ffree >= cache->frun &&
- stats.f_bavail >= cache->brun &&
- test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
- ) {
- _debug("cease culling");
- cachefiles_state_changed(cache);
+ if (!cache->rootdirname) {
+ pr_err("No cache directory specified\n");
+ return -EINVAL;
}
- //_leave(" = 0");
- return 0;
+ /* Don't permit already bound caches to be re-bound */
+ if (test_bit(CACHEFILES_READY, &cache->flags)) {
+ pr_err("Cache already bound\n");
+ return -EBUSY;
+ }
-begin_cull:
- if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
- _debug("### CULL CACHE ###");
- cachefiles_state_changed(cache);
+ /* Make sure we have copies of the tag string */
+ if (!cache->tag) {
+ /*
+ * The tag string is released by the fops->release()
+ * function, so we don't release it on error here
+ */
+ cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
+ if (!cache->tag)
+ return -ENOMEM;
}
- _leave(" = %d", ret);
- return ret;
+ return cachefiles_add_cache(cache);
+}
+
+/*
+ * Unbind a cache.
+ */
+static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
+{
+ _enter("");
+
+ if (test_bit(CACHEFILES_READY, &cache->flags))
+ cachefiles_withdraw_cache(cache);
+
+ cachefiles_put_directory(cache->graveyard);
+ cachefiles_put_directory(cache->store);
+ mntput(cache->mnt);
+
+ kfree(cache->rootdirname);
+ kfree(cache->secctx);
+ kfree(cache->tag);
+
+ _leave("");
}
diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c
new file mode 100644
index 000000000000..58f8aec964e4
--- /dev/null
+++ b/fs/cachefiles/error_inject.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Error injection handling.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/sysctl.h>
+#include "internal.h"
+
+unsigned int cachefiles_error_injection_state;
+
+static struct ctl_table_header *cachefiles_sysctl;
+static struct ctl_table cachefiles_sysctls[] = {
+ {
+ .procname = "error_injection",
+ .data = &cachefiles_error_injection_state,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ {}
+};
+
+static struct ctl_table cachefiles_sysctls_root[] = {
+ {
+ .procname = "cachefiles",
+ .mode = 0555,
+ .child = cachefiles_sysctls,
+ },
+ {}
+};
+
+int __init cachefiles_register_error_injection(void)
+{
+ cachefiles_sysctl = register_sysctl_table(cachefiles_sysctls_root);
+ if (!cachefiles_sysctl)
+ return -ENOMEM;
+ return 0;
+
+}
+
+void cachefiles_unregister_error_injection(void)
+{
+ unregister_sysctl_table(cachefiles_sysctl);
+}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index da28ac1fa225..ae93cee9d25d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -1,572 +1,445 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* FS-Cache interface to CacheFiles
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/slab.h>
#include <linux/mount.h>
+#include <linux/xattr.h>
+#include <linux/file.h>
+#include <linux/falloc.h>
+#include <trace/events/fscache.h>
#include "internal.h"
-struct cachefiles_lookup_data {
- struct cachefiles_xattr *auxdata; /* auxiliary data */
- char *key; /* key path */
-};
-
-static int cachefiles_attr_changed(struct fscache_object *_object);
+static atomic_t cachefiles_object_debug_id;
/*
- * allocate an object record for a cookie lookup and prepare the lookup data
+ * Allocate a cache object record.
*/
-static struct fscache_object *cachefiles_alloc_object(
- struct fscache_cache *_cache,
- struct fscache_cookie *cookie)
+static
+struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie)
{
- struct cachefiles_lookup_data *lookup_data;
+ struct fscache_volume *vcookie = cookie->volume;
+ struct cachefiles_volume *volume = vcookie->cache_priv;
struct cachefiles_object *object;
- struct cachefiles_cache *cache;
- struct cachefiles_xattr *auxdata;
- unsigned keylen, auxlen;
- void *buffer, *p;
- char *key;
- cache = container_of(_cache, struct cachefiles_cache, cache);
+ _enter("{%s},%x,", vcookie->key, cookie->debug_id);
- _enter("{%s},%x,", cache->cache.identifier, cookie->debug_id);
-
- lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);
- if (!lookup_data)
- goto nomem_lookup_data;
-
- /* create a new object record and a temporary leaf image */
- object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp);
+ object = kmem_cache_zalloc(cachefiles_object_jar, GFP_KERNEL);
if (!object)
- goto nomem_object;
-
- ASSERTCMP(object->backer, ==, NULL);
+ return NULL;
- BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
- atomic_set(&object->usage, 1);
+ refcount_set(&object->ref, 1);
- fscache_object_init(&object->fscache, cookie, &cache->cache);
+ spin_lock_init(&object->lock);
+ INIT_LIST_HEAD(&object->cache_link);
+ object->volume = volume;
+ object->debug_id = atomic_inc_return(&cachefiles_object_debug_id);
+ object->cookie = fscache_get_cookie(cookie, fscache_cookie_get_attach_object);
- object->type = cookie->def->type;
-
- /* get hold of the raw key
- * - stick the length on the front and leave space on the back for the
- * encoder
- */
- buffer = kmalloc((2 + 512) + 3, cachefiles_gfp);
- if (!buffer)
- goto nomem_buffer;
-
- keylen = cookie->key_len;
- if (keylen <= sizeof(cookie->inline_key))
- p = cookie->inline_key;
- else
- p = cookie->key;
- memcpy(buffer + 2, p, keylen);
-
- *(uint16_t *)buffer = keylen;
- ((char *)buffer)[keylen + 2] = 0;
- ((char *)buffer)[keylen + 3] = 0;
- ((char *)buffer)[keylen + 4] = 0;
-
- /* turn the raw key into something that can work with as a filename */
- key = cachefiles_cook_key(buffer, keylen + 2, object->type);
- if (!key)
- goto nomem_key;
-
- /* get hold of the auxiliary data and prepend the object type */
- auxdata = buffer;
- auxlen = cookie->aux_len;
- if (auxlen) {
- if (auxlen <= sizeof(cookie->inline_aux))
- p = cookie->inline_aux;
- else
- p = cookie->aux;
- memcpy(auxdata->data, p, auxlen);
- }
-
- auxdata->len = auxlen + 1;
- auxdata->type = cookie->type;
-
- lookup_data->auxdata = auxdata;
- lookup_data->key = key;
- object->lookup_data = lookup_data;
-
- _leave(" = %x [%p]", object->fscache.debug_id, lookup_data);
- return &object->fscache;
-
-nomem_key:
- kfree(buffer);
-nomem_buffer:
- BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
- kmem_cache_free(cachefiles_object_jar, object);
- fscache_object_destroyed(&cache->cache);
-nomem_object:
- kfree(lookup_data);
-nomem_lookup_data:
- _leave(" = -ENOMEM");
- return ERR_PTR(-ENOMEM);
+ fscache_count_object(vcookie->cache);
+ trace_cachefiles_ref(object->debug_id, cookie->debug_id, 1,
+ cachefiles_obj_new);
+ return object;
}
/*
- * attempt to look up the nominated node in this cache
- * - return -ETIMEDOUT to be scheduled again
+ * Note that an object has been seen.
*/
-static int cachefiles_lookup_object(struct fscache_object *_object)
+void cachefiles_see_object(struct cachefiles_object *object,
+ enum cachefiles_obj_ref_trace why)
{
- struct cachefiles_lookup_data *lookup_data;
- struct cachefiles_object *parent, *object;
- struct cachefiles_cache *cache;
- const struct cred *saved_cred;
- int ret;
-
- _enter("{OBJ%x}", _object->debug_id);
-
- cache = container_of(_object->cache, struct cachefiles_cache, cache);
- parent = container_of(_object->parent,
- struct cachefiles_object, fscache);
- object = container_of(_object, struct cachefiles_object, fscache);
- lookup_data = object->lookup_data;
-
- ASSERTCMP(lookup_data, !=, NULL);
-
- /* look up the key, creating any missing bits */
- cachefiles_begin_secure(cache, &saved_cred);
- ret = cachefiles_walk_to_object(parent, object,
- lookup_data->key,
- lookup_data->auxdata);
- cachefiles_end_secure(cache, saved_cred);
-
- /* polish off by setting the attributes of non-index files */
- if (ret == 0 &&
- object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
- cachefiles_attr_changed(&object->fscache);
-
- if (ret < 0 && ret != -ETIMEDOUT) {
- if (ret != -ENOBUFS)
- pr_warn("Lookup failed error %d\n", ret);
- fscache_object_lookup_error(&object->fscache);
- }
-
- _leave(" [%d]", ret);
- return ret;
+ trace_cachefiles_ref(object->debug_id, object->cookie->debug_id,
+ refcount_read(&object->ref), why);
}
/*
- * indication of lookup completion
+ * Increment the usage count on an object.
*/
-static void cachefiles_lookup_complete(struct fscache_object *_object)
+struct cachefiles_object *cachefiles_grab_object(struct cachefiles_object *object,
+ enum cachefiles_obj_ref_trace why)
{
- struct cachefiles_object *object;
-
- object = container_of(_object, struct cachefiles_object, fscache);
-
- _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
+ int r;
- if (object->lookup_data) {
- kfree(object->lookup_data->key);
- kfree(object->lookup_data->auxdata);
- kfree(object->lookup_data);
- object->lookup_data = NULL;
- }
+ __refcount_inc(&object->ref, &r);
+ trace_cachefiles_ref(object->debug_id, object->cookie->debug_id, r, why);
+ return object;
}
/*
- * increment the usage count on an inode object (may fail if unmounting)
+ * Dispose of a reference to an object.
*/
-static
-struct fscache_object *cachefiles_grab_object(struct fscache_object *_object,
- enum fscache_obj_ref_trace why)
+void cachefiles_put_object(struct cachefiles_object *object,
+ enum cachefiles_obj_ref_trace why)
{
- struct cachefiles_object *object =
- container_of(_object, struct cachefiles_object, fscache);
- int u;
+ unsigned int object_debug_id = object->debug_id;
+ unsigned int cookie_debug_id = object->cookie->debug_id;
+ struct fscache_cache *cache;
+ bool done;
+ int r;
+
+ done = __refcount_dec_and_test(&object->ref, &r);
+ trace_cachefiles_ref(object_debug_id, cookie_debug_id, r, why);
+ if (done) {
+ _debug("- kill object OBJ%x", object_debug_id);
+
+ ASSERTCMP(object->file, ==, NULL);
- _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
+ kfree(object->d_name);
-#ifdef CACHEFILES_DEBUG_SLAB
- ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
-#endif
+ cache = object->volume->cache->cache;
+ fscache_put_cookie(object->cookie, fscache_cookie_put_object);
+ object->cookie = NULL;
+ kmem_cache_free(cachefiles_object_jar, object);
+ fscache_uncount_object(cache);
+ }
- u = atomic_inc_return(&object->usage);
- trace_cachefiles_ref(object, _object->cookie,
- (enum cachefiles_obj_ref_trace)why, u);
- return &object->fscache;
+ _leave("");
}
/*
- * update the auxiliary data for an object object on disk
+ * Adjust the size of a cache file if necessary to match the DIO size. We keep
+ * the EOF marker a multiple of DIO blocks so that we don't fall back to doing
+ * non-DIO for a partial block straddling the EOF, but we also have to be
+ * careful of someone expanding the file and accidentally accreting the
+ * padding.
*/
-static void cachefiles_update_object(struct fscache_object *_object)
+static int cachefiles_adjust_size(struct cachefiles_object *object)
{
- struct cachefiles_object *object;
- struct cachefiles_xattr *auxdata;
- struct cachefiles_cache *cache;
- struct fscache_cookie *cookie;
- const struct cred *saved_cred;
- const void *aux;
- unsigned auxlen;
+ struct iattr newattrs;
+ struct file *file = object->file;
+ uint64_t ni_size;
+ loff_t oi_size;
+ int ret;
- _enter("{OBJ%x}", _object->debug_id);
+ ni_size = object->cookie->object_size;
+ ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
- object = container_of(_object, struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache, struct cachefiles_cache,
- cache);
+ _enter("{OBJ%x},[%llu]",
+ object->debug_id, (unsigned long long) ni_size);
- if (!fscache_use_cookie(_object)) {
- _leave(" [relinq]");
- return;
- }
+ if (!file)
+ return -ENOBUFS;
- cookie = object->fscache.cookie;
- auxlen = cookie->aux_len;
+ oi_size = i_size_read(file_inode(file));
+ if (oi_size == ni_size)
+ return 0;
- if (!auxlen) {
- fscache_unuse_cookie(_object);
- _leave(" [no aux]");
- return;
- }
+ inode_lock(file_inode(file));
- auxdata = kmalloc(2 + auxlen + 3, cachefiles_gfp);
- if (!auxdata) {
- fscache_unuse_cookie(_object);
- _leave(" [nomem]");
- return;
+ /* if there's an extension to a partial page at the end of the backing
+ * file, we need to discard the partial page so that we pick up new
+ * data after it */
+ if (oi_size & ~PAGE_MASK && ni_size > oi_size) {
+ _debug("discard tail %llx", oi_size);
+ newattrs.ia_valid = ATTR_SIZE;
+ newattrs.ia_size = oi_size & PAGE_MASK;
+ ret = cachefiles_inject_remove_error();
+ if (ret == 0)
+ ret = notify_change(&init_user_ns, file->f_path.dentry,
+ &newattrs, NULL);
+ if (ret < 0)
+ goto truncate_failed;
}
- aux = (auxlen <= sizeof(cookie->inline_aux)) ?
- cookie->inline_aux : cookie->aux;
+ newattrs.ia_valid = ATTR_SIZE;
+ newattrs.ia_size = ni_size;
+ ret = cachefiles_inject_write_error();
+ if (ret == 0)
+ ret = notify_change(&init_user_ns, file->f_path.dentry,
+ &newattrs, NULL);
- memcpy(auxdata->data, aux, auxlen);
- fscache_unuse_cookie(_object);
+truncate_failed:
+ inode_unlock(file_inode(file));
- auxdata->len = auxlen + 1;
- auxdata->type = cookie->type;
+ if (ret < 0)
+ trace_cachefiles_io_error(NULL, file_inode(file), ret,
+ cachefiles_trace_notify_change_error);
+ if (ret == -EIO) {
+ cachefiles_io_error_obj(object, "Size set failed");
+ ret = -ENOBUFS;
+ }
- cachefiles_begin_secure(cache, &saved_cred);
- cachefiles_update_object_xattr(object, auxdata);
- cachefiles_end_secure(cache, saved_cred);
- kfree(auxdata);
- _leave("");
+ _leave(" = %d", ret);
+ return ret;
}
/*
- * discard the resources pinned by an object and effect retirement if
- * requested
+ * Attempt to look up the nominated node in this cache
*/
-static void cachefiles_drop_object(struct fscache_object *_object)
+static bool cachefiles_lookup_cookie(struct fscache_cookie *cookie)
{
struct cachefiles_object *object;
- struct cachefiles_cache *cache;
+ struct cachefiles_cache *cache = cookie->volume->cache->cache_priv;
const struct cred *saved_cred;
- struct inode *inode;
- blkcnt_t i_blocks = 0;
+ bool success;
- ASSERT(_object);
+ object = cachefiles_alloc_object(cookie);
+ if (!object)
+ goto fail;
- object = container_of(_object, struct cachefiles_object, fscache);
+ _enter("{OBJ%x}", object->debug_id);
- _enter("{OBJ%x,%d}",
- object->fscache.debug_id, atomic_read(&object->usage));
+ if (!cachefiles_cook_key(object))
+ goto fail_put;
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
+ cookie->cache_priv = object;
-#ifdef CACHEFILES_DEBUG_SLAB
- ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
-#endif
+ cachefiles_begin_secure(cache, &saved_cred);
- /* We need to tidy the object up if we did in fact manage to open it.
- * It's possible for us to get here before the object is fully
- * initialised if the parent goes away or the object gets retired
- * before we set it up.
- */
- if (object->dentry) {
- /* delete retired objects */
- if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
- _object != cache->cache.fsdef
- ) {
- _debug("- retire object OBJ%x", object->fscache.debug_id);
- inode = d_backing_inode(object->dentry);
- if (inode)
- i_blocks = inode->i_blocks;
-
- cachefiles_begin_secure(cache, &saved_cred);
- cachefiles_delete_object(cache, object);
- cachefiles_end_secure(cache, saved_cred);
- }
+ success = cachefiles_look_up_object(object);
+ if (!success)
+ goto fail_withdraw;
- /* close the filesystem stuff attached to the object */
- if (object->backer != object->dentry)
- dput(object->backer);
- object->backer = NULL;
- }
+ cachefiles_see_object(object, cachefiles_obj_see_lookup_cookie);
+
+ spin_lock(&cache->object_list_lock);
+ list_add(&object->cache_link, &cache->object_list);
+ spin_unlock(&cache->object_list_lock);
+ cachefiles_adjust_size(object);
- /* note that the object is now inactive */
- if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
- cachefiles_mark_object_inactive(cache, object, i_blocks);
+ cachefiles_end_secure(cache, saved_cred);
+ _leave(" = t");
+ return true;
- dput(object->dentry);
- object->dentry = NULL;
+fail_withdraw:
+ cachefiles_end_secure(cache, saved_cred);
+ cachefiles_see_object(object, cachefiles_obj_see_lookup_failed);
+ fscache_caching_failed(cookie);
+ _debug("failed c=%08x o=%08x", cookie->debug_id, object->debug_id);
+ /* The caller holds an access count on the cookie, so we need them to
+ * drop it before we can withdraw the object.
+ */
+ return false;
- _leave("");
+fail_put:
+ cachefiles_put_object(object, cachefiles_obj_put_alloc_fail);
+fail:
+ return false;
}
/*
- * dispose of a reference to an object
+ * Shorten the backing object to discard any dirty data and free up
+ * any unused granules.
*/
-void cachefiles_put_object(struct fscache_object *_object,
- enum fscache_obj_ref_trace why)
+static bool cachefiles_shorten_object(struct cachefiles_object *object,
+ struct file *file, loff_t new_size)
{
- struct cachefiles_object *object;
- struct fscache_cache *cache;
- int u;
-
- ASSERT(_object);
-
- object = container_of(_object, struct cachefiles_object, fscache);
-
- _enter("{OBJ%x,%d}",
- object->fscache.debug_id, atomic_read(&object->usage));
-
-#ifdef CACHEFILES_DEBUG_SLAB
- ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
-#endif
-
- ASSERTIFCMP(object->fscache.parent,
- object->fscache.parent->n_children, >, 0);
-
- u = atomic_dec_return(&object->usage);
- trace_cachefiles_ref(object, _object->cookie,
- (enum cachefiles_obj_ref_trace)why, u);
- ASSERTCMP(u, !=, -1);
- if (u == 0) {
- _debug("- kill object OBJ%x", object->fscache.debug_id);
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct inode *inode = file_inode(file);
+ loff_t i_size, dio_size;
+ int ret;
- ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
- ASSERTCMP(object->fscache.parent, ==, NULL);
- ASSERTCMP(object->backer, ==, NULL);
- ASSERTCMP(object->dentry, ==, NULL);
- ASSERTCMP(object->fscache.n_ops, ==, 0);
- ASSERTCMP(object->fscache.n_children, ==, 0);
+ dio_size = round_up(new_size, CACHEFILES_DIO_BLOCK_SIZE);
+ i_size = i_size_read(inode);
+
+ trace_cachefiles_trunc(object, inode, i_size, dio_size,
+ cachefiles_trunc_shrink);
+ ret = cachefiles_inject_remove_error();
+ if (ret == 0)
+ ret = vfs_truncate(&file->f_path, dio_size);
+ if (ret < 0) {
+ trace_cachefiles_io_error(object, file_inode(file), ret,
+ cachefiles_trace_trunc_error);
+ cachefiles_io_error_obj(object, "Trunc-to-size failed %d", ret);
+ cachefiles_remove_object_xattr(cache, object, file->f_path.dentry);
+ return false;
+ }
- if (object->lookup_data) {
- kfree(object->lookup_data->key);
- kfree(object->lookup_data->auxdata);
- kfree(object->lookup_data);
- object->lookup_data = NULL;
+ if (new_size < dio_size) {
+ trace_cachefiles_trunc(object, inode, dio_size, new_size,
+ cachefiles_trunc_dio_adjust);
+ ret = cachefiles_inject_write_error();
+ if (ret == 0)
+ ret = vfs_fallocate(file, FALLOC_FL_ZERO_RANGE,
+ new_size, dio_size - new_size);
+ if (ret < 0) {
+ trace_cachefiles_io_error(object, file_inode(file), ret,
+ cachefiles_trace_fallocate_error);
+ cachefiles_io_error_obj(object, "Trunc-to-dio-size failed %d", ret);
+ cachefiles_remove_object_xattr(cache, object, file->f_path.dentry);
+ return false;
}
-
- cache = object->fscache.cache;
- fscache_object_destroy(&object->fscache);
- kmem_cache_free(cachefiles_object_jar, object);
- fscache_object_destroyed(cache);
}
- _leave("");
+ return true;
}
/*
- * sync a cache
+ * Resize the backing object.
*/
-static void cachefiles_sync_cache(struct fscache_cache *_cache)
+static void cachefiles_resize_cookie(struct netfs_cache_resources *cres,
+ loff_t new_size)
{
- struct cachefiles_cache *cache;
+ struct cachefiles_object *object = cachefiles_cres_object(cres);
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct fscache_cookie *cookie = object->cookie;
const struct cred *saved_cred;
- int ret;
+ struct file *file = cachefiles_cres_file(cres);
+ loff_t old_size = cookie->object_size;
- _enter("%s", _cache->tag->name);
+ _enter("%llu->%llu", old_size, new_size);
- cache = container_of(_cache, struct cachefiles_cache, cache);
-
- /* make sure all pages pinned by operations on behalf of the netfs are
- * written to disc */
- cachefiles_begin_secure(cache, &saved_cred);
- down_read(&cache->mnt->mnt_sb->s_umount);
- ret = sync_filesystem(cache->mnt->mnt_sb);
- up_read(&cache->mnt->mnt_sb->s_umount);
- cachefiles_end_secure(cache, saved_cred);
+ if (new_size < old_size) {
+ cachefiles_begin_secure(cache, &saved_cred);
+ cachefiles_shorten_object(object, file, new_size);
+ cachefiles_end_secure(cache, saved_cred);
+ object->cookie->object_size = new_size;
+ return;
+ }
- if (ret == -EIO)
- cachefiles_io_error(cache,
- "Attempt to sync backing fs superblock"
- " returned error %d",
- ret);
+ /* The file is being expanded. We don't need to do anything in
+ * particular. cookie->initial_size doesn't change, and so the point
+ * before which we have to download doesn't change.
+ */
+ cookie->object_size = new_size;
}
/*
- * check if the backing cache is updated to FS-Cache
- * - called by FS-Cache when evaluates if need to invalidate the cache
+ * Commit changes to the object as we drop it.
*/
-static int cachefiles_check_consistency(struct fscache_operation *op)
+static void cachefiles_commit_object(struct cachefiles_object *object,
+ struct cachefiles_cache *cache)
{
- struct cachefiles_object *object;
- struct cachefiles_cache *cache;
- const struct cred *saved_cred;
- int ret;
+ bool update = false;
- _enter("{OBJ%x}", op->object->debug_id);
+ if (test_and_clear_bit(FSCACHE_COOKIE_LOCAL_WRITE, &object->cookie->flags))
+ update = true;
+ if (test_and_clear_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags))
+ update = true;
+ if (update)
+ cachefiles_set_object_xattr(object);
- object = container_of(op->object, struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
+ if (test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags))
+ cachefiles_commit_tmpfile(cache, object);
+}
- cachefiles_begin_secure(cache, &saved_cred);
- ret = cachefiles_check_auxdata(object);
- cachefiles_end_secure(cache, saved_cred);
+/*
+ * Finalise an object and close the VFS structs that we have.
+ */
+static void cachefiles_clean_up_object(struct cachefiles_object *object,
+ struct cachefiles_cache *cache)
+{
+ if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) {
+ if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
+ cachefiles_see_object(object, cachefiles_obj_see_clean_delete);
+ _debug("- inval object OBJ%x", object->debug_id);
+ cachefiles_delete_object(object, FSCACHE_OBJECT_WAS_RETIRED);
+ } else {
+ cachefiles_see_object(object, cachefiles_obj_see_clean_drop_tmp);
+ _debug("- inval object OBJ%x tmpfile", object->debug_id);
+ }
+ } else {
+ cachefiles_see_object(object, cachefiles_obj_see_clean_commit);
+ cachefiles_commit_object(object, cache);
+ }
- _leave(" = %d", ret);
- return ret;
+ cachefiles_unmark_inode_in_use(object, object->file);
+ if (object->file) {
+ fput(object->file);
+ object->file = NULL;
+ }
}
/*
- * notification the attributes on an object have changed
- * - called with reads/writes excluded by FS-Cache
+ * Withdraw caching for a cookie.
*/
-static int cachefiles_attr_changed(struct fscache_object *_object)
+static void cachefiles_withdraw_cookie(struct fscache_cookie *cookie)
{
- struct cachefiles_object *object;
- struct cachefiles_cache *cache;
+ struct cachefiles_object *object = cookie->cache_priv;
+ struct cachefiles_cache *cache = object->volume->cache;
const struct cred *saved_cred;
- struct iattr newattrs;
- uint64_t ni_size;
- loff_t oi_size;
- int ret;
-
- ni_size = _object->store_limit_l;
-
- _enter("{OBJ%x},[%llu]",
- _object->debug_id, (unsigned long long) ni_size);
-
- object = container_of(_object, struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
-
- if (ni_size == object->i_size)
- return 0;
-
- if (!object->backer)
- return -ENOBUFS;
- ASSERT(d_is_reg(object->backer));
+ _enter("o=%x", object->debug_id);
+ cachefiles_see_object(object, cachefiles_obj_see_withdraw_cookie);
- fscache_set_store_limit(&object->fscache, ni_size);
-
- oi_size = i_size_read(d_backing_inode(object->backer));
- if (oi_size == ni_size)
- return 0;
-
- cachefiles_begin_secure(cache, &saved_cred);
- inode_lock(d_inode(object->backer));
-
- /* if there's an extension to a partial page at the end of the backing
- * file, we need to discard the partial page so that we pick up new
- * data after it */
- if (oi_size & ~PAGE_MASK && ni_size > oi_size) {
- _debug("discard tail %llx", oi_size);
- newattrs.ia_valid = ATTR_SIZE;
- newattrs.ia_size = oi_size & PAGE_MASK;
- ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL);
- if (ret < 0)
- goto truncate_failed;
+ if (!list_empty(&object->cache_link)) {
+ spin_lock(&cache->object_list_lock);
+ cachefiles_see_object(object, cachefiles_obj_see_withdrawal);
+ list_del_init(&object->cache_link);
+ spin_unlock(&cache->object_list_lock);
}
- newattrs.ia_valid = ATTR_SIZE;
- newattrs.ia_size = ni_size;
- ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL);
-
-truncate_failed:
- inode_unlock(d_inode(object->backer));
- cachefiles_end_secure(cache, saved_cred);
-
- if (ret == -EIO) {
- fscache_set_store_limit(&object->fscache, 0);
- cachefiles_io_error_obj(object, "Size set failed");
- ret = -ENOBUFS;
+ if (object->file) {
+ cachefiles_begin_secure(cache, &saved_cred);
+ cachefiles_clean_up_object(object, cache);
+ cachefiles_end_secure(cache, saved_cred);
}
- _leave(" = %d", ret);
- return ret;
+ cookie->cache_priv = NULL;
+ cachefiles_put_object(object, cachefiles_obj_put_detach);
}
/*
- * Invalidate an object
+ * Invalidate the storage associated with a cookie.
*/
-static void cachefiles_invalidate_object(struct fscache_operation *op)
+static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie)
{
- struct cachefiles_object *object;
- struct cachefiles_cache *cache;
- const struct cred *saved_cred;
- struct path path;
- uint64_t ni_size;
- int ret;
+ struct cachefiles_object *object = cookie->cache_priv;
+ struct file *new_file, *old_file;
+ bool old_tmpfile;
- object = container_of(op->object, struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
+ _enter("o=%x,[%llu]", object->debug_id, object->cookie->object_size);
- ni_size = op->object->store_limit_l;
+ old_tmpfile = test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags);
- _enter("{OBJ%x},[%llu]",
- op->object->debug_id, (unsigned long long)ni_size);
+ if (!object->file) {
+ fscache_resume_after_invalidation(cookie);
+ _leave(" = t [light]");
+ return true;
+ }
- if (object->backer) {
- ASSERT(d_is_reg(object->backer));
+ new_file = cachefiles_create_tmpfile(object);
+ if (IS_ERR(new_file))
+ goto failed;
- fscache_set_store_limit(&object->fscache, ni_size);
+ /* Substitute the VFS target */
+ _debug("sub");
+ spin_lock(&object->lock);
- path.dentry = object->backer;
- path.mnt = cache->mnt;
+ old_file = object->file;
+ object->file = new_file;
+ object->content_info = CACHEFILES_CONTENT_NO_DATA;
+ set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags);
+ set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags);
- cachefiles_begin_secure(cache, &saved_cred);
- ret = vfs_truncate(&path, 0);
- if (ret == 0)
- ret = vfs_truncate(&path, ni_size);
- cachefiles_end_secure(cache, saved_cred);
+ spin_unlock(&object->lock);
+ _debug("subbed");
+
+ /* Allow I/O to take place again */
+ fscache_resume_after_invalidation(cookie);
+
+ if (old_file) {
+ if (!old_tmpfile) {
+ struct cachefiles_volume *volume = object->volume;
+ struct dentry *fan = volume->fanout[(u8)cookie->key_hash];
- if (ret != 0) {
- fscache_set_store_limit(&object->fscache, 0);
- if (ret == -EIO)
- cachefiles_io_error_obj(object,
- "Invalidate failed");
+ inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
+ cachefiles_bury_object(volume->cache, object, fan,
+ old_file->f_path.dentry,
+ FSCACHE_OBJECT_INVALIDATED);
}
+ fput(old_file);
}
- fscache_op_complete(op, true);
- _leave("");
-}
+ _leave(" = t");
+ return true;
-/*
- * dissociate a cache from all the pages it was backing
- */
-static void cachefiles_dissociate_pages(struct fscache_cache *cache)
-{
- _enter("");
+failed:
+ _leave(" = f");
+ return false;
}
const struct fscache_cache_ops cachefiles_cache_ops = {
.name = "cachefiles",
- .alloc_object = cachefiles_alloc_object,
- .lookup_object = cachefiles_lookup_object,
- .lookup_complete = cachefiles_lookup_complete,
- .grab_object = cachefiles_grab_object,
- .update_object = cachefiles_update_object,
- .invalidate_object = cachefiles_invalidate_object,
- .drop_object = cachefiles_drop_object,
- .put_object = cachefiles_put_object,
- .sync_cache = cachefiles_sync_cache,
- .attr_changed = cachefiles_attr_changed,
- .read_or_alloc_page = cachefiles_read_or_alloc_page,
- .read_or_alloc_pages = cachefiles_read_or_alloc_pages,
- .allocate_page = cachefiles_allocate_page,
- .allocate_pages = cachefiles_allocate_pages,
- .write_page = cachefiles_write_page,
- .uncache_page = cachefiles_uncache_page,
- .dissociate_pages = cachefiles_dissociate_pages,
- .check_consistency = cachefiles_check_consistency,
- .begin_read_operation = cachefiles_begin_read_operation,
+ .acquire_volume = cachefiles_acquire_volume,
+ .free_volume = cachefiles_free_volume,
+ .lookup_cookie = cachefiles_lookup_cookie,
+ .withdraw_cookie = cachefiles_withdraw_cookie,
+ .invalidate_cookie = cachefiles_invalidate_cookie,
+ .begin_operation = cachefiles_begin_operation,
+ .resize_cookie = cachefiles_resize_cookie,
+ .prepare_to_write = cachefiles_prepare_to_write,
};
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 0a511c36dab8..c793d33b0224 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* General netfs cache on cache files internal defs
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
@@ -13,58 +13,72 @@
#include <linux/fscache-cache.h>
-#include <linux/timer.h>
-#include <linux/wait_bit.h>
#include <linux/cred.h>
-#include <linux/workqueue.h>
#include <linux/security.h>
+#define CACHEFILES_DIO_BLOCK_SIZE 4096
+
struct cachefiles_cache;
struct cachefiles_object;
-extern unsigned cachefiles_debug;
-#define CACHEFILES_DEBUG_KENTER 1
-#define CACHEFILES_DEBUG_KLEAVE 2
-#define CACHEFILES_DEBUG_KDEBUG 4
+enum cachefiles_content {
+ /* These values are saved on disk */
+ CACHEFILES_CONTENT_NO_DATA = 0, /* No content stored */
+ CACHEFILES_CONTENT_SINGLE = 1, /* Content is monolithic, all is present */
+ CACHEFILES_CONTENT_ALL = 2, /* Content is all present, no map */
+ CACHEFILES_CONTENT_BACKFS_MAP = 3, /* Content is piecemeal, mapped through backing fs */
+ CACHEFILES_CONTENT_DIRTY = 4, /* Content is dirty (only seen on disk) */
+ nr__cachefiles_content
+};
-#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC)
+/*
+ * Cached volume representation.
+ */
+struct cachefiles_volume {
+ struct cachefiles_cache *cache;
+ struct list_head cache_link; /* Link in cache->volumes */
+ struct fscache_volume *vcookie; /* The netfs's representation */
+ struct dentry *dentry; /* The volume dentry */
+ struct dentry *fanout[256]; /* Fanout subdirs */
+};
/*
- * node records
+ * Backing file state.
*/
struct cachefiles_object {
- struct fscache_object fscache; /* fscache handle */
- struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
- struct dentry *dentry; /* the file/dir representing this object */
- struct dentry *backer; /* backing file */
- loff_t i_size; /* object size */
+ struct fscache_cookie *cookie; /* Netfs data storage object cookie */
+ struct cachefiles_volume *volume; /* Cache volume that holds this object */
+ struct list_head cache_link; /* Link in cache->*_list */
+ struct file *file; /* The file representing this object */
+ char *d_name; /* Backing file name */
+ int debug_id;
+ spinlock_t lock;
+ refcount_t ref;
+ u8 d_name_len; /* Length of filename */
+ enum cachefiles_content content_info:8; /* Info about content presence */
unsigned long flags;
-#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
- atomic_t usage; /* object usage count */
- uint8_t type; /* object type */
- uint8_t new; /* T if object new */
- spinlock_t work_lock;
- struct rb_node active_node; /* link in active tree (dentry is key) */
+#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */
};
-extern struct kmem_cache *cachefiles_object_jar;
-
/*
* Cache files cache definition
*/
struct cachefiles_cache {
- struct fscache_cache cache; /* FS-Cache record */
+ struct fscache_cache *cache; /* Cache cookie */
struct vfsmount *mnt; /* mountpoint holding the cache */
+ struct dentry *store; /* Directory into which live objects go */
struct dentry *graveyard; /* directory into which dead objects go */
struct file *cachefilesd; /* manager daemon handle */
+ struct list_head volumes; /* List of volume objects */
+ struct list_head object_list; /* List of active objects */
+ spinlock_t object_list_lock; /* Lock for volumes and object_list */
const struct cred *cache_cred; /* security override for accessing cache */
struct mutex daemon_mutex; /* command serialisation mutex */
wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
- struct rb_root active_nodes; /* active nodes (can't be culled) */
- rwlock_t active_lock; /* lock for active_nodes */
atomic_t gravecounter; /* graveyard uniquifier */
atomic_t f_released; /* number of objects released lately */
atomic_long_t b_released; /* number of blocks released lately */
+ atomic_long_t b_writing; /* Number of blocks being written */
unsigned frun_percent; /* when to stop culling (% files) */
unsigned fcull_percent; /* when to start culling (% files) */
unsigned fstop_percent; /* when to stop allocating (% files) */
@@ -72,7 +86,7 @@ struct cachefiles_cache {
unsigned bcull_percent; /* when to start culling (% blocks) */
unsigned bstop_percent; /* when to stop allocating (% blocks) */
unsigned bsize; /* cache's block size */
- unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */
+ unsigned bshift; /* ilog2(bsize) */
uint64_t frun; /* when to stop culling */
uint64_t fcull; /* when to start culling */
uint64_t fstop; /* when to stop allocating */
@@ -89,38 +103,19 @@ struct cachefiles_cache {
char *tag; /* cache binding tag */
};
-/*
- * backing file read tracking
- */
-struct cachefiles_one_read {
- wait_queue_entry_t monitor; /* link into monitored waitqueue */
- struct page *back_page; /* backing file page we're waiting for */
- struct page *netfs_page; /* netfs page we're going to fill */
- struct fscache_retrieval *op; /* retrieval op covering this */
- struct list_head op_link; /* link in op's todo list */
-};
-
-/*
- * backing file write tracking
- */
-struct cachefiles_one_write {
- struct page *netfs_page; /* netfs page to copy */
- struct cachefiles_object *object;
- struct list_head obj_link; /* link in object's lists */
- fscache_rw_complete_t end_io_func;
- void *context;
-};
+#include <trace/events/cachefiles.h>
-/*
- * auxiliary data xattr buffer
- */
-struct cachefiles_xattr {
- uint16_t len;
- uint8_t type;
- uint8_t data[];
-};
+static inline
+struct file *cachefiles_cres_file(struct netfs_cache_resources *cres)
+{
+ return cres->cache_priv2;
+}
-#include <trace/events/cachefiles.h>
+static inline
+struct cachefiles_object *cachefiles_cres_object(struct netfs_cache_resources *cres)
+{
+ return fscache_cres_cookie(cres)->cache_priv;
+}
/*
* note change of state for daemon
@@ -132,74 +127,118 @@ static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
}
/*
- * bind.c
+ * cache.c
*/
-extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
-extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
+extern int cachefiles_add_cache(struct cachefiles_cache *cache);
+extern void cachefiles_withdraw_cache(struct cachefiles_cache *cache);
+
+enum cachefiles_has_space_for {
+ cachefiles_has_space_check,
+ cachefiles_has_space_for_write,
+ cachefiles_has_space_for_create,
+};
+extern int cachefiles_has_space(struct cachefiles_cache *cache,
+ unsigned fnr, unsigned bnr,
+ enum cachefiles_has_space_for reason);
/*
* daemon.c
*/
extern const struct file_operations cachefiles_daemon_fops;
-extern int cachefiles_has_space(struct cachefiles_cache *cache,
- unsigned fnr, unsigned bnr);
+/*
+ * error_inject.c
+ */
+#ifdef CONFIG_CACHEFILES_ERROR_INJECTION
+extern unsigned int cachefiles_error_injection_state;
+extern int cachefiles_register_error_injection(void);
+extern void cachefiles_unregister_error_injection(void);
+
+#else
+#define cachefiles_error_injection_state 0
+
+static inline int cachefiles_register_error_injection(void)
+{
+ return 0;
+}
+
+static inline void cachefiles_unregister_error_injection(void)
+{
+}
+#endif
+
+
+static inline int cachefiles_inject_read_error(void)
+{
+ return cachefiles_error_injection_state & 2 ? -EIO : 0;
+}
+
+static inline int cachefiles_inject_write_error(void)
+{
+ return cachefiles_error_injection_state & 2 ? -EIO :
+ cachefiles_error_injection_state & 1 ? -ENOSPC :
+ 0;
+}
+
+static inline int cachefiles_inject_remove_error(void)
+{
+ return cachefiles_error_injection_state & 2 ? -EIO : 0;
+}
/*
* interface.c
*/
extern const struct fscache_cache_ops cachefiles_cache_ops;
+extern void cachefiles_see_object(struct cachefiles_object *object,
+ enum cachefiles_obj_ref_trace why);
+extern struct cachefiles_object *cachefiles_grab_object(struct cachefiles_object *object,
+ enum cachefiles_obj_ref_trace why);
+extern void cachefiles_put_object(struct cachefiles_object *object,
+ enum cachefiles_obj_ref_trace why);
-void cachefiles_put_object(struct fscache_object *_object,
- enum fscache_obj_ref_trace why);
+/*
+ * io.c
+ */
+extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
+ enum fscache_want_state want_state);
/*
* key.c
*/
-extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
+extern bool cachefiles_cook_key(struct cachefiles_object *object);
+
+/*
+ * main.c
+ */
+extern struct kmem_cache *cachefiles_object_jar;
/*
* namei.c
*/
-extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
- struct cachefiles_object *object,
- blkcnt_t i_blocks);
-extern int cachefiles_delete_object(struct cachefiles_cache *cache,
- struct cachefiles_object *object);
-extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
- struct cachefiles_object *object,
- const char *key,
- struct cachefiles_xattr *auxdata);
+extern void cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
+ struct file *file);
+extern int cachefiles_bury_object(struct cachefiles_cache *cache,
+ struct cachefiles_object *object,
+ struct dentry *dir,
+ struct dentry *rep,
+ enum fscache_why_object_killed why);
+extern int cachefiles_delete_object(struct cachefiles_object *object,
+ enum fscache_why_object_killed why);
+extern bool cachefiles_look_up_object(struct cachefiles_object *object);
extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
struct dentry *dir,
- const char *name);
+ const char *name,
+ bool *_is_new);
+extern void cachefiles_put_directory(struct dentry *dir);
extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
char *filename);
extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
struct dentry *dir, char *filename);
-
-/*
- * rdwr.c
- */
-extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
- struct page *, gfp_t);
-extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
- struct list_head *, unsigned *,
- gfp_t);
-extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
- gfp_t);
-extern int cachefiles_allocate_pages(struct fscache_retrieval *,
- struct list_head *, unsigned *, gfp_t);
-extern int cachefiles_write_page(struct fscache_storage *, struct page *);
-extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
-
-/*
- * rdwr2.c
- */
-extern int cachefiles_begin_read_operation(struct netfs_read_request *,
- struct fscache_retrieval *);
+extern struct file *cachefiles_create_tmpfile(struct cachefiles_object *object);
+extern bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
+ struct cachefiles_object *object);
/*
* security.c
@@ -222,28 +261,32 @@ static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
}
/*
+ * volume.c
+ */
+void cachefiles_acquire_volume(struct fscache_volume *volume);
+void cachefiles_free_volume(struct fscache_volume *volume);
+void cachefiles_withdraw_volume(struct cachefiles_volume *volume);
+
+/*
* xattr.c
*/
-extern int cachefiles_check_object_type(struct cachefiles_object *object);
-extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
- struct cachefiles_xattr *auxdata);
-extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
- struct cachefiles_xattr *auxdata);
-extern int cachefiles_check_auxdata(struct cachefiles_object *object);
-extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
- struct cachefiles_xattr *auxdata);
+extern int cachefiles_set_object_xattr(struct cachefiles_object *object);
+extern int cachefiles_check_auxdata(struct cachefiles_object *object,
+ struct file *file);
extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+ struct cachefiles_object *object,
struct dentry *dentry);
-
+extern void cachefiles_prepare_to_write(struct fscache_cookie *cookie);
+extern bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume);
+extern int cachefiles_check_volume_xattr(struct cachefiles_volume *volume);
/*
- * error handling
+ * Error handling
*/
-
#define cachefiles_io_error(___cache, FMT, ...) \
do { \
pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
- fscache_io_error(&(___cache)->cache); \
+ fscache_io_error((___cache)->cache); \
set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
} while (0)
@@ -251,15 +294,20 @@ do { \
do { \
struct cachefiles_cache *___cache; \
\
- ___cache = container_of((object)->fscache.cache, \
- struct cachefiles_cache, cache); \
- cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
+ ___cache = (object)->volume->cache; \
+ cachefiles_io_error(___cache, FMT " [o=%08x]", ##__VA_ARGS__, \
+ (object)->debug_id); \
} while (0)
/*
- * debug tracing
+ * Debug tracing
*/
+extern unsigned cachefiles_debug;
+#define CACHEFILES_DEBUG_KENTER 1
+#define CACHEFILES_DEBUG_KLEAVE 2
+#define CACHEFILES_DEBUG_KDEBUG 4
+
#define dbgprintk(FMT, ...) \
printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index fac2e8e7b533..9dc81e781f2b 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -9,8 +9,9 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/falloc.h>
#include <linux/sched/mm.h>
-#include <linux/netfs.h>
+#include <trace/events/fscache.h>
#include "internal.h"
struct cachefiles_kiocb {
@@ -21,14 +22,18 @@ struct cachefiles_kiocb {
size_t skipped;
size_t len;
};
+ struct cachefiles_object *object;
netfs_io_terminated_t term_func;
void *term_func_priv;
bool was_async;
+ unsigned int inval_counter; /* Copy of cookie->inval_counter */
+ u64 b_writing;
};
static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
{
if (refcount_dec_and_test(&ki->ki_refcnt)) {
+ cachefiles_put_object(ki->object, cachefiles_obj_put_ioreq);
fput(ki->iocb.ki_filp);
kfree(ki);
}
@@ -37,15 +42,25 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
/*
* Handle completion of a read from the cache.
*/
-static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_read_complete(struct kiocb *iocb, long ret)
{
struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
+ struct inode *inode = file_inode(ki->iocb.ki_filp);
+
+ _enter("%ld", ret);
- _enter("%ld,%ld", ret, ret2);
+ if (ret < 0)
+ trace_cachefiles_io_error(ki->object, inode, ret,
+ cachefiles_trace_read_error);
if (ki->term_func) {
- if (ret >= 0)
- ret += ki->skipped;
+ if (ret >= 0) {
+ if (ki->object->cookie->inval_counter == ki->inval_counter)
+ ki->skipped += ret;
+ else
+ ret = -ESTALE;
+ }
+
ki->term_func(ki->term_func_priv, ret, ki->was_async);
}
@@ -58,16 +73,24 @@ static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
static int cachefiles_read(struct netfs_cache_resources *cres,
loff_t start_pos,
struct iov_iter *iter,
- bool seek_data,
+ enum netfs_read_from_hole read_hole,
netfs_io_terminated_t term_func,
void *term_func_priv)
{
+ struct cachefiles_object *object;
struct cachefiles_kiocb *ki;
- struct file *file = cres->cache_priv2;
+ struct file *file;
unsigned int old_nofs;
ssize_t ret = -ENOBUFS;
size_t len = iov_iter_count(iter), skipped = 0;
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
+ goto presubmission_error;
+
+ fscache_count_read();
+ object = cachefiles_cres_object(cres);
+ file = cachefiles_cres_file(cres);
+
_enter("%pD,%li,%llx,%zx/%llx",
file, file_inode(file)->i_ino, start_pos, len,
i_size_read(file_inode(file)));
@@ -75,10 +98,12 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
/* If the caller asked us to seek for data before doing the read, then
* we should do that now. If we find a gap, we fill it with zeros.
*/
- if (seek_data) {
+ if (read_hole != NETFS_READ_HOLE_IGNORE) {
loff_t off = start_pos, off2;
- off2 = vfs_llseek(file, off, SEEK_DATA);
+ off2 = cachefiles_inject_read_error();
+ if (off2 == 0)
+ off2 = vfs_llseek(file, off, SEEK_DATA);
if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) {
skipped = 0;
ret = off2;
@@ -90,6 +115,10 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
* in the region, so clear the rest of the buffer and
* return success.
*/
+ ret = -ENODATA;
+ if (read_hole == NETFS_READ_HOLE_FAIL)
+ goto presubmission_error;
+
iov_iter_zero(len, iter);
skipped = len;
ret = 0;
@@ -100,7 +129,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
iov_iter_zero(skipped, iter);
}
- ret = -ENOBUFS;
+ ret = -ENOMEM;
ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
if (!ki)
goto presubmission_error;
@@ -109,9 +138,10 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
ki->iocb.ki_filp = file;
ki->iocb.ki_pos = start_pos + skipped;
ki->iocb.ki_flags = IOCB_DIRECT;
- ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
ki->iocb.ki_ioprio = get_current_ioprio();
ki->skipped = skipped;
+ ki->object = object;
+ ki->inval_counter = cres->inval_counter;
ki->term_func = term_func;
ki->term_func_priv = term_func_priv;
ki->was_async = true;
@@ -120,9 +150,13 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
ki->iocb.ki_complete = cachefiles_read_complete;
get_file(ki->iocb.ki_filp);
+ cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
+ trace_cachefiles_read(object, file_inode(file), ki->iocb.ki_pos, len - skipped);
old_nofs = memalloc_nofs_save();
- ret = vfs_iocb_iter_read(file, &ki->iocb, iter);
+ ret = cachefiles_inject_read_error();
+ if (ret == 0)
+ ret = vfs_iocb_iter_read(file, &ki->iocb, iter);
memalloc_nofs_restore(old_nofs);
switch (ret) {
case -EIOCBQUEUED:
@@ -139,7 +173,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
fallthrough;
default:
ki->was_async = false;
- cachefiles_read_complete(&ki->iocb, ret, 0);
+ cachefiles_read_complete(&ki->iocb, ret);
if (ret > 0)
ret = 0;
break;
@@ -157,22 +191,86 @@ presubmission_error:
}
/*
+ * Query the occupancy of the cache in a region, returning where the next chunk
+ * of data starts and how long it is.
+ */
+static int cachefiles_query_occupancy(struct netfs_cache_resources *cres,
+ loff_t start, size_t len, size_t granularity,
+ loff_t *_data_start, size_t *_data_len)
+{
+ struct cachefiles_object *object;
+ struct file *file;
+ loff_t off, off2;
+
+ *_data_start = -1;
+ *_data_len = 0;
+
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
+ return -ENOBUFS;
+
+ object = cachefiles_cres_object(cres);
+ file = cachefiles_cres_file(cres);
+ granularity = max_t(size_t, object->volume->cache->bsize, granularity);
+
+ _enter("%pD,%li,%llx,%zx/%llx",
+ file, file_inode(file)->i_ino, start, len,
+ i_size_read(file_inode(file)));
+
+ off = cachefiles_inject_read_error();
+ if (off == 0)
+ off = vfs_llseek(file, start, SEEK_DATA);
+ if (off == -ENXIO)
+ return -ENODATA; /* Beyond EOF */
+ if (off < 0 && off >= (loff_t)-MAX_ERRNO)
+ return -ENOBUFS; /* Error. */
+ if (round_up(off, granularity) >= start + len)
+ return -ENODATA; /* No data in range */
+
+ off2 = cachefiles_inject_read_error();
+ if (off2 == 0)
+ off2 = vfs_llseek(file, off, SEEK_HOLE);
+ if (off2 == -ENXIO)
+ return -ENODATA; /* Beyond EOF */
+ if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO)
+ return -ENOBUFS; /* Error. */
+
+ /* Round away partial blocks */
+ off = round_up(off, granularity);
+ off2 = round_down(off2, granularity);
+ if (off2 <= off)
+ return -ENODATA;
+
+ *_data_start = off;
+ if (off2 > start + len)
+ *_data_len = len;
+ else
+ *_data_len = off2 - off;
+ return 0;
+}
+
+/*
* Handle completion of a write to the cache.
*/
-static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_write_complete(struct kiocb *iocb, long ret)
{
struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
+ struct cachefiles_object *object = ki->object;
struct inode *inode = file_inode(ki->iocb.ki_filp);
- _enter("%ld,%ld", ret, ret2);
+ _enter("%ld", ret);
/* Tell lockdep we inherited freeze protection from submission thread */
__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
__sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
+ if (ret < 0)
+ trace_cachefiles_io_error(object, inode, ret,
+ cachefiles_trace_write_error);
+
+ atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing);
+ set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags);
if (ki->term_func)
ki->term_func(ki->term_func_priv, ret, ki->was_async);
-
cachefiles_put_kiocb(ki);
}
@@ -185,17 +283,27 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
netfs_io_terminated_t term_func,
void *term_func_priv)
{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
struct cachefiles_kiocb *ki;
struct inode *inode;
- struct file *file = cres->cache_priv2;
+ struct file *file;
unsigned int old_nofs;
ssize_t ret = -ENOBUFS;
size_t len = iov_iter_count(iter);
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
+ goto presubmission_error;
+ fscache_count_write();
+ object = cachefiles_cres_object(cres);
+ cache = object->volume->cache;
+ file = cachefiles_cres_file(cres);
+
_enter("%pD,%li,%llx,%zx/%llx",
file, file_inode(file)->i_ino, start_pos, len,
i_size_read(file_inode(file)));
+ ret = -ENOMEM;
ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
if (!ki)
goto presubmission_error;
@@ -204,16 +312,19 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
ki->iocb.ki_filp = file;
ki->iocb.ki_pos = start_pos;
ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE;
- ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
ki->iocb.ki_ioprio = get_current_ioprio();
+ ki->object = object;
+ ki->inval_counter = cres->inval_counter;
ki->start = start_pos;
ki->len = len;
ki->term_func = term_func;
ki->term_func_priv = term_func_priv;
ki->was_async = true;
+ ki->b_writing = (len + (1 << cache->bshift) - 1) >> cache->bshift;
if (ki->term_func)
ki->iocb.ki_complete = cachefiles_write_complete;
+ atomic_long_add(ki->b_writing, &cache->b_writing);
/* Open-code file_start_write here to grab freeze protection, which
* will be released by another thread in aio_complete_rw(). Fool
@@ -225,9 +336,13 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
get_file(ki->iocb.ki_filp);
+ cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
+ trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len);
old_nofs = memalloc_nofs_save();
- ret = vfs_iocb_iter_write(file, &ki->iocb, iter);
+ ret = cachefiles_inject_write_error();
+ if (ret == 0)
+ ret = vfs_iocb_iter_write(file, &ki->iocb, iter);
memalloc_nofs_restore(old_nofs);
switch (ret) {
case -EIOCBQUEUED:
@@ -244,7 +359,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
fallthrough;
default:
ki->was_async = false;
- cachefiles_write_complete(&ki->iocb, ret, 0);
+ cachefiles_write_complete(&ki->iocb, ret);
if (ret > 0)
ret = 0;
break;
@@ -257,58 +372,93 @@ in_progress:
presubmission_error:
if (term_func)
- term_func(term_func_priv, -ENOMEM, false);
- return -ENOMEM;
+ term_func(term_func_priv, ret, false);
+ return ret;
}
/*
* Prepare a read operation, shortening it to a cached/uncached
* boundary as appropriate.
*/
-static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq,
+static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
loff_t i_size)
{
- struct fscache_retrieval *op = subreq->rreq->cache_resources.cache_priv;
+ enum cachefiles_prepare_read_trace why;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
struct cachefiles_object *object;
struct cachefiles_cache *cache;
+ struct fscache_cookie *cookie = fscache_cres_cookie(cres);
const struct cred *saved_cred;
- struct file *file = subreq->rreq->cache_resources.cache_priv2;
+ struct file *file = cachefiles_cres_file(cres);
+ enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
loff_t off, to;
+ ino_t ino = file ? file_inode(file)->i_ino : 0;
_enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
- object = container_of(op->op.object,
- struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
+ if (subreq->start >= i_size) {
+ ret = NETFS_FILL_WITH_ZEROES;
+ why = cachefiles_trace_read_after_eof;
+ goto out_no_object;
+ }
- if (!file)
- goto cache_fail_nosec;
+ if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+ why = cachefiles_trace_read_no_data;
+ goto out_no_object;
+ }
- if (subreq->start >= i_size)
- return NETFS_FILL_WITH_ZEROES;
+ /* The object and the file may be being created in the background. */
+ if (!file) {
+ why = cachefiles_trace_read_no_file;
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
+ goto out_no_object;
+ file = cachefiles_cres_file(cres);
+ if (!file)
+ goto out_no_object;
+ ino = file_inode(file)->i_ino;
+ }
+ object = cachefiles_cres_object(cres);
+ cache = object->volume->cache;
cachefiles_begin_secure(cache, &saved_cred);
- off = vfs_llseek(file, subreq->start, SEEK_DATA);
+ off = cachefiles_inject_read_error();
+ if (off == 0)
+ off = vfs_llseek(file, subreq->start, SEEK_DATA);
if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
- if (off == (loff_t)-ENXIO)
+ if (off == (loff_t)-ENXIO) {
+ why = cachefiles_trace_read_seek_nxio;
goto download_and_store;
- goto cache_fail;
+ }
+ trace_cachefiles_io_error(object, file_inode(file), off,
+ cachefiles_trace_seek_error);
+ why = cachefiles_trace_read_seek_error;
+ goto out;
}
- if (off >= subreq->start + subreq->len)
+ if (off >= subreq->start + subreq->len) {
+ why = cachefiles_trace_read_found_hole;
goto download_and_store;
+ }
if (off > subreq->start) {
off = round_up(off, cache->bsize);
subreq->len = off - subreq->start;
+ why = cachefiles_trace_read_found_part;
goto download_and_store;
}
- to = vfs_llseek(file, subreq->start, SEEK_HOLE);
- if (to < 0 && to >= (loff_t)-MAX_ERRNO)
- goto cache_fail;
+ to = cachefiles_inject_read_error();
+ if (to == 0)
+ to = vfs_llseek(file, subreq->start, SEEK_HOLE);
+ if (to < 0 && to >= (loff_t)-MAX_ERRNO) {
+ trace_cachefiles_io_error(object, file_inode(file), to,
+ cachefiles_trace_seek_error);
+ why = cachefiles_trace_read_seek_error;
+ goto out;
+ }
if (to < subreq->start + subreq->len) {
if (subreq->start + subreq->len >= i_size)
@@ -318,32 +468,119 @@ static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subreque
subreq->len = to - subreq->start;
}
- cachefiles_end_secure(cache, saved_cred);
- return NETFS_READ_FROM_CACHE;
+ why = cachefiles_trace_read_have_data;
+ ret = NETFS_READ_FROM_CACHE;
+ goto out;
download_and_store:
- if (cachefiles_has_space(cache, 0, (subreq->len + PAGE_SIZE - 1) / PAGE_SIZE) == 0)
- __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
-cache_fail:
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+out:
cachefiles_end_secure(cache, saved_cred);
-cache_fail_nosec:
- return NETFS_DOWNLOAD_FROM_SERVER;
+out_no_object:
+ trace_cachefiles_prep_read(subreq, ret, why, ino);
+ return ret;
}
/*
* Prepare for a write to occur.
*/
-static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
- loff_t *_start, size_t *_len, loff_t i_size)
+static int __cachefiles_prepare_write(struct netfs_cache_resources *cres,
+ loff_t *_start, size_t *_len, loff_t i_size,
+ bool no_space_allocated_yet)
{
- loff_t start = *_start;
+ struct cachefiles_object *object = cachefiles_cres_object(cres);
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct file *file = cachefiles_cres_file(cres);
+ loff_t start = *_start, pos;
size_t len = *_len, down;
+ int ret;
/* Round to DIO size */
down = start - round_down(start, PAGE_SIZE);
*_start = start - down;
*_len = round_up(down + len, PAGE_SIZE);
- return 0;
+
+ /* We need to work out whether there's sufficient disk space to perform
+ * the write - but we can skip that check if we have space already
+ * allocated.
+ */
+ if (no_space_allocated_yet)
+ goto check_space;
+
+ pos = cachefiles_inject_read_error();
+ if (pos == 0)
+ pos = vfs_llseek(file, *_start, SEEK_DATA);
+ if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
+ if (pos == -ENXIO)
+ goto check_space; /* Unallocated tail */
+ trace_cachefiles_io_error(object, file_inode(file), pos,
+ cachefiles_trace_seek_error);
+ return pos;
+ }
+ if ((u64)pos >= (u64)*_start + *_len)
+ goto check_space; /* Unallocated region */
+
+ /* We have a block that's at least partially filled - if we're low on
+ * space, we need to see if it's fully allocated. If it's not, we may
+ * want to cull it.
+ */
+ if (cachefiles_has_space(cache, 0, *_len / PAGE_SIZE,
+ cachefiles_has_space_check) == 0)
+ return 0; /* Enough space to simply overwrite the whole block */
+
+ pos = cachefiles_inject_read_error();
+ if (pos == 0)
+ pos = vfs_llseek(file, *_start, SEEK_HOLE);
+ if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
+ trace_cachefiles_io_error(object, file_inode(file), pos,
+ cachefiles_trace_seek_error);
+ return pos;
+ }
+ if ((u64)pos >= (u64)*_start + *_len)
+ return 0; /* Fully allocated */
+
+ /* Partially allocated, but insufficient space: cull. */
+ fscache_count_no_write_space();
+ ret = cachefiles_inject_remove_error();
+ if (ret == 0)
+ ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ *_start, *_len);
+ if (ret < 0) {
+ trace_cachefiles_io_error(object, file_inode(file), ret,
+ cachefiles_trace_fallocate_error);
+ cachefiles_io_error_obj(object,
+ "CacheFiles: fallocate failed (%d)\n", ret);
+ ret = -EIO;
+ }
+
+ return ret;
+
+check_space:
+ return cachefiles_has_space(cache, 0, *_len / PAGE_SIZE,
+ cachefiles_has_space_for_write);
+}
+
+static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
+ loff_t *_start, size_t *_len, loff_t i_size,
+ bool no_space_allocated_yet)
+{
+ struct cachefiles_object *object = cachefiles_cres_object(cres);
+ struct cachefiles_cache *cache = object->volume->cache;
+ const struct cred *saved_cred;
+ int ret;
+
+ if (!cachefiles_cres_file(cres)) {
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
+ return -ENOBUFS;
+ if (!cachefiles_cres_file(cres))
+ return -ENOBUFS;
+ }
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = __cachefiles_prepare_write(cres, _start, _len, i_size,
+ no_space_allocated_yet);
+ cachefiles_end_secure(cache, saved_cred);
+ return ret;
}
/*
@@ -351,19 +588,11 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
*/
static void cachefiles_end_operation(struct netfs_cache_resources *cres)
{
- struct fscache_retrieval *op = cres->cache_priv;
- struct file *file = cres->cache_priv2;
-
- _enter("");
+ struct file *file = cachefiles_cres_file(cres);
if (file)
fput(file);
- if (op) {
- fscache_op_complete(&op->op, false);
- fscache_put_retrieval(op);
- }
-
- _leave("");
+ fscache_end_cookie_access(fscache_cres_cookie(cres), fscache_access_io_end);
}
static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
@@ -372,49 +601,31 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
.write = cachefiles_write,
.prepare_read = cachefiles_prepare_read,
.prepare_write = cachefiles_prepare_write,
+ .query_occupancy = cachefiles_query_occupancy,
};
/*
* Open the cache file when beginning a cache operation.
*/
-int cachefiles_begin_read_operation(struct netfs_read_request *rreq,
- struct fscache_retrieval *op)
+bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
+ enum fscache_want_state want_state)
{
- struct cachefiles_object *object;
- struct cachefiles_cache *cache;
- struct path path;
- struct file *file;
-
- _enter("");
-
- object = container_of(op->op.object,
- struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
-
- path.mnt = cache->mnt;
- path.dentry = object->backer;
- file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
- d_inode(object->backer), cache->cache_cred);
- if (IS_ERR(file))
- return PTR_ERR(file);
- if (!S_ISREG(file_inode(file)->i_mode))
- goto error_file;
- if (unlikely(!file->f_op->read_iter) ||
- unlikely(!file->f_op->write_iter)) {
- pr_notice("Cache does not support read_iter and write_iter\n");
- goto error_file;
+ struct cachefiles_object *object = cachefiles_cres_object(cres);
+
+ if (!cachefiles_cres_file(cres)) {
+ cres->ops = &cachefiles_netfs_cache_ops;
+ if (object->file) {
+ spin_lock(&object->lock);
+ if (!cres->cache_priv2 && object->file)
+ cres->cache_priv2 = get_file(object->file);
+ spin_unlock(&object->lock);
+ }
}
- fscache_get_retrieval(op);
- rreq->cache_resources.cache_priv = op;
- rreq->cache_resources.cache_priv2 = file;
- rreq->cache_resources.ops = &cachefiles_netfs_cache_ops;
- rreq->cache_resources.debug_id = object->fscache.debug_id;
- _leave("");
- return 0;
+ if (!cachefiles_cres_file(cres) && want_state != FSCACHE_WANT_PARAMS) {
+ pr_err("failed to get cres->file\n");
+ return false;
+ }
-error_file:
- fput(file);
- return -EIO;
+ return true;
}
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index 7f94efc97e23..bf935e25bdbe 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Key to pathname encoder
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
@@ -22,134 +22,117 @@ static const char cachefiles_filecharmap[256] = {
[48 ... 127] = 1, /* '0' -> '~' */
};
+static inline unsigned int how_many_hex_digits(unsigned int x)
+{
+ return x ? round_up(ilog2(x) + 1, 4) / 4 : 0;
+}
+
/*
* turn the raw key into something cooked
- * - the raw key should include the length in the two bytes at the front
- * - the key may be up to 514 bytes in length (including the length word)
+ * - the key may be up to NAME_MAX in length (including the length word)
* - "base64" encode the strange keys, mapping 3 bytes of raw to four of
* cooked
* - need to cut the cooked key into 252 char lengths (189 raw bytes)
*/
-char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
+bool cachefiles_cook_key(struct cachefiles_object *object)
{
- unsigned char csum, ch;
- unsigned int acc;
- char *key;
- int loop, len, max, seg, mark, print;
+ const u8 *key = fscache_get_key(object->cookie), *kend;
+ unsigned char ch;
+ unsigned int acc, i, n, nle, nbe, keylen = object->cookie->key_len;
+ unsigned int b64len, len, print, pad;
+ char *name, sep;
- _enter(",%d", keylen);
+ _enter(",%u,%*phN", keylen, keylen, key);
- BUG_ON(keylen < 2 || keylen > 514);
+ BUG_ON(keylen > NAME_MAX - 3);
- csum = raw[0] + raw[1];
print = 1;
- for (loop = 2; loop < keylen; loop++) {
- ch = raw[loop];
- csum += ch;
+ for (i = 0; i < keylen; i++) {
+ ch = key[i];
print &= cachefiles_filecharmap[ch];
}
+ /* If the path is usable ASCII, then we render it directly */
if (print) {
- /* if the path is usable ASCII, then we render it directly */
- max = keylen - 2;
- max += 2; /* two base64'd length chars on the front */
- max += 5; /* @checksum/M */
- max += 3 * 2; /* maximum number of segment dividers (".../M")
- * is ((514 + 251) / 252) = 3
- */
- max += 1; /* NUL on end */
- } else {
- /* calculate the maximum length of the cooked key */
- keylen = (keylen + 2) / 3;
-
- max = keylen * 4;
- max += 5; /* @checksum/M */
- max += 3 * 2; /* maximum number of segment dividers (".../M")
- * is ((514 + 188) / 189) = 3
- */
- max += 1; /* NUL on end */
+ len = 1 + keylen;
+ name = kmalloc(len + 1, GFP_KERNEL);
+ if (!name)
+ return false;
+
+ name[0] = 'D'; /* Data object type, string encoding */
+ memcpy(name + 1, key, keylen);
+ goto success;
}
- max += 1; /* 2nd NUL on end */
-
- _debug("max: %d", max);
-
- key = kmalloc(max, cachefiles_gfp);
- if (!key)
- return NULL;
-
- len = 0;
-
- /* build the cooked key */
- sprintf(key, "@%02x%c+", (unsigned) csum, 0);
- len = 5;
- mark = len - 1;
-
- if (print) {
- acc = *(uint16_t *) raw;
- raw += 2;
-
- key[len + 1] = cachefiles_charmap[acc & 63];
- acc >>= 6;
- key[len] = cachefiles_charmap[acc & 63];
- len += 2;
-
- seg = 250;
- for (loop = keylen; loop > 0; loop--) {
- if (seg <= 0) {
- key[len++] = '\0';
- mark = len;
- key[len++] = '+';
- seg = 252;
- }
-
- key[len++] = *raw++;
- ASSERT(len < max);
- }
-
- switch (type) {
- case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break;
- case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break;
- default: type = 'S'; break;
- }
- } else {
- seg = 252;
- for (loop = keylen; loop > 0; loop--) {
- if (seg <= 0) {
- key[len++] = '\0';
- mark = len;
- key[len++] = '+';
- seg = 252;
- }
-
- acc = *raw++;
- acc |= *raw++ << 8;
- acc |= *raw++ << 16;
-
- _debug("acc: %06x", acc);
-
- key[len++] = cachefiles_charmap[acc & 63];
- acc >>= 6;
- key[len++] = cachefiles_charmap[acc & 63];
- acc >>= 6;
- key[len++] = cachefiles_charmap[acc & 63];
- acc >>= 6;
- key[len++] = cachefiles_charmap[acc & 63];
-
- ASSERT(len < max);
- }
+ /* See if it makes sense to encode it as "hex,hex,hex" for each 32-bit
+ * chunk. We rely on the key having been padded out to a whole number
+ * of 32-bit words.
+ */
+ n = round_up(keylen, 4);
+ nbe = nle = 0;
+ for (i = 0; i < n; i += 4) {
+ u32 be = be32_to_cpu(*(__be32 *)(key + i));
+ u32 le = le32_to_cpu(*(__le32 *)(key + i));
+
+ nbe += 1 + how_many_hex_digits(be);
+ nle += 1 + how_many_hex_digits(le);
+ }
- switch (type) {
- case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break;
- case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break;
- default: type = 'T'; break;
+ b64len = DIV_ROUND_UP(keylen, 3);
+ pad = b64len * 3 - keylen;
+ b64len = 2 + b64len * 4; /* Length if we base64-encode it */
+ _debug("len=%u nbe=%u nle=%u b64=%u", keylen, nbe, nle, b64len);
+ if (nbe < b64len || nle < b64len) {
+ unsigned int nlen = min(nbe, nle) + 1;
+ name = kmalloc(nlen, GFP_KERNEL);
+ if (!name)
+ return false;
+ sep = (nbe <= nle) ? 'S' : 'T'; /* Encoding indicator */
+ len = 0;
+ for (i = 0; i < n; i += 4) {
+ u32 x;
+ if (nbe <= nle)
+ x = be32_to_cpu(*(__be32 *)(key + i));
+ else
+ x = le32_to_cpu(*(__le32 *)(key + i));
+ name[len++] = sep;
+ if (x != 0)
+ len += snprintf(name + len, nlen - len, "%x", x);
+ sep = ',';
}
+ goto success;
}
- key[mark] = type;
- key[len++] = 0;
- key[len] = 0;
+ /* We need to base64-encode it */
+ name = kmalloc(b64len + 1, GFP_KERNEL);
+ if (!name)
+ return false;
+
+ name[0] = 'E';
+ name[1] = '0' + pad;
+ len = 2;
+ kend = key + keylen;
+ do {
+ acc = *key++;
+ if (key < kend) {
+ acc |= *key++ << 8;
+ if (key < kend)
+ acc |= *key++ << 16;
+ }
- _leave(" = %s %d", key, len);
- return key;
+ name[len++] = cachefiles_charmap[acc & 63];
+ acc >>= 6;
+ name[len++] = cachefiles_charmap[acc & 63];
+ acc >>= 6;
+ name[len++] = cachefiles_charmap[acc & 63];
+ acc >>= 6;
+ name[len++] = cachefiles_charmap[acc & 63];
+ } while (key < kend);
+
+success:
+ name[len] = 0;
+ object->d_name = name;
+ object->d_name_len = len;
+ _leave(" = %s", object->d_name);
+ return true;
}
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 9c8d34c49b12..3f369c6f816d 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -2,7 +2,7 @@
/* Network filesystem caching backend to use cache files on a premounted
* filesystem
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
@@ -18,6 +18,8 @@
#include <linux/statfs.h>
#include <linux/sysctl.h>
#include <linux/miscdevice.h>
+#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#define CREATE_TRACE_POINTS
#include "internal.h"
@@ -37,14 +39,6 @@ static struct miscdevice cachefiles_dev = {
.fops = &cachefiles_daemon_fops,
};
-static void cachefiles_object_init_once(void *_object)
-{
- struct cachefiles_object *object = _object;
-
- memset(object, 0, sizeof(*object));
- spin_lock_init(&object->work_lock);
-}
-
/*
* initialise the fs caching module
*/
@@ -52,6 +46,9 @@ static int __init cachefiles_init(void)
{
int ret;
+ ret = cachefiles_register_error_injection();
+ if (ret < 0)
+ goto error_einj;
ret = misc_register(&cachefiles_dev);
if (ret < 0)
goto error_dev;
@@ -61,9 +58,7 @@ static int __init cachefiles_init(void)
cachefiles_object_jar =
kmem_cache_create("cachefiles_object_jar",
sizeof(struct cachefiles_object),
- 0,
- SLAB_HWCACHE_ALIGN,
- cachefiles_object_init_once);
+ 0, SLAB_HWCACHE_ALIGN, NULL);
if (!cachefiles_object_jar) {
pr_notice("Failed to allocate an object jar\n");
goto error_object_jar;
@@ -75,6 +70,8 @@ static int __init cachefiles_init(void)
error_object_jar:
misc_deregister(&cachefiles_dev);
error_dev:
+ cachefiles_unregister_error_injection();
+error_einj:
pr_err("failed to register: %d\n", ret);
return ret;
}
@@ -90,6 +87,7 @@ static void __exit cachefiles_exit(void)
kmem_cache_destroy(cachefiles_object_jar);
misc_deregister(&cachefiles_dev);
+ cachefiles_unregister_error_injection();
}
module_exit(cachefiles_exit);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index a9aca5ab5970..ca9f3e4ec4b3 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -1,295 +1,280 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* CacheFiles path walking and related routines
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/file.h>
#include <linux/fs.h>
-#include <linux/fsnotify.h>
-#include <linux/quotaops.h>
-#include <linux/xattr.h>
-#include <linux/mount.h>
#include <linux/namei.h>
-#include <linux/security.h>
-#include <linux/slab.h>
#include "internal.h"
-#define CACHEFILES_KEYBUF_SIZE 512
-
/*
- * dump debugging info about an object
+ * Mark the backing file as being a cache file if it's not already in use. The
+ * mark tells the culling request command that it's not allowed to cull the
+ * file or directory. The caller must hold the inode lock.
*/
-static noinline
-void __cachefiles_printk_object(struct cachefiles_object *object,
- const char *prefix)
+static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object,
+ struct dentry *dentry)
{
- struct fscache_cookie *cookie;
- const u8 *k;
- unsigned loop;
-
- pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id);
- pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
- prefix, object->fscache.state->name,
- object->fscache.flags, work_busy(&object->fscache.work),
- object->fscache.events, object->fscache.event_mask);
- pr_err("%sops=%u inp=%u exc=%u\n",
- prefix, object->fscache.n_ops, object->fscache.n_in_progress,
- object->fscache.n_exclusive);
- pr_err("%sparent=%x\n",
- prefix, object->fscache.parent ? object->fscache.parent->debug_id : 0);
-
- spin_lock(&object->fscache.lock);
- cookie = object->fscache.cookie;
- if (cookie) {
- pr_err("%scookie=%x [pr=%x nd=%p fl=%lx]\n",
- prefix,
- cookie->debug_id,
- cookie->parent ? cookie->parent->debug_id : 0,
- cookie->netfs_data,
- cookie->flags);
- pr_err("%skey=[%u] '", prefix, cookie->key_len);
- k = (cookie->key_len <= sizeof(cookie->inline_key)) ?
- cookie->inline_key : cookie->key;
- for (loop = 0; loop < cookie->key_len; loop++)
- pr_cont("%02x", k[loop]);
- pr_cont("'\n");
+ struct inode *inode = d_backing_inode(dentry);
+ bool can_use = false;
+
+ if (!(inode->i_flags & S_KERNEL_FILE)) {
+ inode->i_flags |= S_KERNEL_FILE;
+ trace_cachefiles_mark_active(object, inode);
+ can_use = true;
} else {
- pr_err("%scookie=NULL\n", prefix);
+ trace_cachefiles_mark_failed(object, inode);
+ pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
+ dentry, inode->i_ino);
}
- spin_unlock(&object->fscache.lock);
+
+ return can_use;
}
-/*
- * dump debugging info about a pair of objects
- */
-static noinline void cachefiles_printk_object(struct cachefiles_object *object,
- struct cachefiles_object *xobject)
+static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object,
+ struct dentry *dentry)
{
- if (object)
- __cachefiles_printk_object(object, "");
- if (xobject)
- __cachefiles_printk_object(xobject, "x");
+ struct inode *inode = d_backing_inode(dentry);
+ bool can_use;
+
+ inode_lock(inode);
+ can_use = __cachefiles_mark_inode_in_use(object, dentry);
+ inode_unlock(inode);
+ return can_use;
}
/*
- * mark the owner of a dentry, if there is one, to indicate that that dentry
- * has been preemptively deleted
- * - the caller must hold the i_mutex on the dentry's parent as required to
- * call vfs_unlink(), vfs_rmdir() or vfs_rename()
+ * Unmark a backing inode. The caller must hold the inode lock.
*/
-static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
- struct dentry *dentry,
- enum fscache_why_object_killed why)
+static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
+ struct dentry *dentry)
{
- struct cachefiles_object *object;
- struct rb_node *p;
-
- _enter(",'%pd'", dentry);
+ struct inode *inode = d_backing_inode(dentry);
- write_lock(&cache->active_lock);
+ inode->i_flags &= ~S_KERNEL_FILE;
+ trace_cachefiles_mark_inactive(object, inode);
+}
- p = cache->active_nodes.rb_node;
- while (p) {
- object = rb_entry(p, struct cachefiles_object, active_node);
- if (object->dentry > dentry)
- p = p->rb_left;
- else if (object->dentry < dentry)
- p = p->rb_right;
- else
- goto found_dentry;
- }
+static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object,
+ struct dentry *dentry)
+{
+ struct inode *inode = d_backing_inode(dentry);
- write_unlock(&cache->active_lock);
- trace_cachefiles_mark_buried(NULL, dentry, why);
- _leave(" [no owner]");
- return;
+ inode_lock(inode);
+ __cachefiles_unmark_inode_in_use(object, dentry);
+ inode_unlock(inode);
+}
- /* found the dentry for */
-found_dentry:
- kdebug("preemptive burial: OBJ%x [%s] %pd",
- object->fscache.debug_id,
- object->fscache.state->name,
- dentry);
+/*
+ * Unmark a backing inode and tell cachefilesd that there's something that can
+ * be culled.
+ */
+void cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
+ struct file *file)
+{
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct inode *inode = file_inode(file);
- trace_cachefiles_mark_buried(object, dentry, why);
+ if (inode) {
+ cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry);
- if (fscache_object_is_live(&object->fscache)) {
- pr_err("\n");
- pr_err("Error: Can't preemptively bury live object\n");
- cachefiles_printk_object(object, NULL);
- } else {
- if (why != FSCACHE_OBJECT_IS_STALE)
- fscache_object_mark_killed(&object->fscache, why);
+ if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
+ atomic_long_add(inode->i_blocks, &cache->b_released);
+ if (atomic_inc_return(&cache->f_released))
+ cachefiles_state_changed(cache);
+ }
}
-
- write_unlock(&cache->active_lock);
- _leave(" [owner marked]");
}
/*
- * record the fact that an object is now active
+ * get a subdirectory
*/
-static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
- struct cachefiles_object *object)
+struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
+ struct dentry *dir,
+ const char *dirname,
+ bool *_is_new)
{
- struct cachefiles_object *xobject;
- struct rb_node **_p, *_parent = NULL;
- struct dentry *dentry;
-
- _enter(",%x", object->fscache.debug_id);
+ struct dentry *subdir;
+ struct path path;
+ int ret;
-try_again:
- write_lock(&cache->active_lock);
+ _enter(",,%s", dirname);
- dentry = object->dentry;
- trace_cachefiles_mark_active(object, dentry);
+ /* search the current directory for the element name */
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
- if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
- pr_err("Error: Object already active\n");
- cachefiles_printk_object(object, NULL);
- BUG();
+retry:
+ ret = cachefiles_inject_read_error();
+ if (ret == 0)
+ subdir = lookup_one_len(dirname, dir, strlen(dirname));
+ else
+ subdir = ERR_PTR(ret);
+ trace_cachefiles_lookup(NULL, dir, subdir);
+ if (IS_ERR(subdir)) {
+ trace_cachefiles_vfs_error(NULL, d_backing_inode(dir),
+ PTR_ERR(subdir),
+ cachefiles_trace_lookup_error);
+ if (PTR_ERR(subdir) == -ENOMEM)
+ goto nomem_d_alloc;
+ goto lookup_error;
}
- _p = &cache->active_nodes.rb_node;
- while (*_p) {
- _parent = *_p;
- xobject = rb_entry(_parent,
- struct cachefiles_object, active_node);
+ _debug("subdir -> %pd %s",
+ subdir, d_backing_inode(subdir) ? "positive" : "negative");
+
+ /* we need to create the subdir if it doesn't exist yet */
+ if (d_is_negative(subdir)) {
+ ret = cachefiles_has_space(cache, 1, 0,
+ cachefiles_has_space_for_create);
+ if (ret < 0)
+ goto mkdir_error;
- ASSERT(xobject != object);
+ _debug("attempt mkdir");
- if (xobject->dentry > dentry)
- _p = &(*_p)->rb_left;
- else if (xobject->dentry < dentry)
- _p = &(*_p)->rb_right;
- else
- goto wait_for_old_object;
+ path.mnt = cache->mnt;
+ path.dentry = dir;
+ ret = security_path_mkdir(&path, subdir, 0700);
+ if (ret < 0)
+ goto mkdir_error;
+ ret = cachefiles_inject_write_error();
+ if (ret == 0)
+ ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700);
+ if (ret < 0) {
+ trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
+ cachefiles_trace_mkdir_error);
+ goto mkdir_error;
+ }
+ trace_cachefiles_mkdir(dir, subdir);
+
+ if (unlikely(d_unhashed(subdir))) {
+ cachefiles_put_directory(subdir);
+ goto retry;
+ }
+ ASSERT(d_backing_inode(subdir));
+
+ _debug("mkdir -> %pd{ino=%lu}",
+ subdir, d_backing_inode(subdir)->i_ino);
+ if (_is_new)
+ *_is_new = true;
}
- rb_link_node(&object->active_node, _parent, _p);
- rb_insert_color(&object->active_node, &cache->active_nodes);
+ /* Tell rmdir() it's not allowed to delete the subdir */
+ inode_lock(d_inode(subdir));
+ inode_unlock(d_inode(dir));
- write_unlock(&cache->active_lock);
- _leave(" = 0");
- return 0;
+ if (!__cachefiles_mark_inode_in_use(NULL, subdir))
+ goto mark_error;
- /* an old object from a previous incarnation is hogging the slot - we
- * need to wait for it to be destroyed */
-wait_for_old_object:
- trace_cachefiles_wait_active(object, dentry, xobject);
- clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
-
- if (fscache_object_is_live(&xobject->fscache)) {
- pr_err("\n");
- pr_err("Error: Unexpected object collision\n");
- cachefiles_printk_object(object, xobject);
- }
- atomic_inc(&xobject->usage);
- write_unlock(&cache->active_lock);
-
- if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
- wait_queue_head_t *wq;
-
- signed long timeout = 60 * HZ;
- wait_queue_entry_t wait;
- bool requeue;
-
- /* if the object we're waiting for is queued for processing,
- * then just put ourselves on the queue behind it */
- if (work_pending(&xobject->fscache.work)) {
- _debug("queue OBJ%x behind OBJ%x immediately",
- object->fscache.debug_id,
- xobject->fscache.debug_id);
- goto requeue;
- }
+ inode_unlock(d_inode(subdir));
- /* otherwise we sleep until either the object we're waiting for
- * is done, or the fscache_object is congested */
- wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
- init_wait(&wait);
- requeue = false;
- do {
- prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
- if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
- break;
-
- requeue = fscache_object_sleep_till_congested(&timeout);
- } while (timeout > 0 && !requeue);
- finish_wait(wq, &wait);
-
- if (requeue &&
- test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
- _debug("queue OBJ%x behind OBJ%x after wait",
- object->fscache.debug_id,
- xobject->fscache.debug_id);
- goto requeue;
- }
+ /* we need to make sure the subdir is a directory */
+ ASSERT(d_backing_inode(subdir));
- if (timeout <= 0) {
- pr_err("\n");
- pr_err("Error: Overlong wait for old active object to go away\n");
- cachefiles_printk_object(object, xobject);
- goto requeue;
- }
+ if (!d_can_lookup(subdir)) {
+ pr_err("%s is not a directory\n", dirname);
+ ret = -EIO;
+ goto check_error;
}
- ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
+ ret = -EPERM;
+ if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) ||
+ !d_backing_inode(subdir)->i_op->lookup ||
+ !d_backing_inode(subdir)->i_op->mkdir ||
+ !d_backing_inode(subdir)->i_op->rename ||
+ !d_backing_inode(subdir)->i_op->rmdir ||
+ !d_backing_inode(subdir)->i_op->unlink)
+ goto check_error;
+
+ _leave(" = [%lu]", d_backing_inode(subdir)->i_ino);
+ return subdir;
- cache->cache.ops->put_object(&xobject->fscache,
- (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_retry);
- goto try_again;
+check_error:
+ cachefiles_put_directory(subdir);
+ _leave(" = %d [check]", ret);
+ return ERR_PTR(ret);
-requeue:
- cache->cache.ops->put_object(&xobject->fscache,
- (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_timeo);
- _leave(" = -ETIMEDOUT");
- return -ETIMEDOUT;
+mark_error:
+ inode_unlock(d_inode(subdir));
+ dput(subdir);
+ return ERR_PTR(-EBUSY);
+
+mkdir_error:
+ inode_unlock(d_inode(dir));
+ dput(subdir);
+ pr_err("mkdir %s failed with error %d\n", dirname, ret);
+ return ERR_PTR(ret);
+
+lookup_error:
+ inode_unlock(d_inode(dir));
+ ret = PTR_ERR(subdir);
+ pr_err("Lookup %s failed with error %d\n", dirname, ret);
+ return ERR_PTR(ret);
+
+nomem_d_alloc:
+ inode_unlock(d_inode(dir));
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
}
/*
- * Mark an object as being inactive.
+ * Put a subdirectory.
*/
-void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
- struct cachefiles_object *object,
- blkcnt_t i_blocks)
+void cachefiles_put_directory(struct dentry *dir)
{
- struct dentry *dentry = object->dentry;
- struct inode *inode = d_backing_inode(dentry);
-
- trace_cachefiles_mark_inactive(object, dentry, inode);
+ if (dir) {
+ inode_lock(dir->d_inode);
+ __cachefiles_unmark_inode_in_use(NULL, dir);
+ inode_unlock(dir->d_inode);
+ dput(dir);
+ }
+}
- write_lock(&cache->active_lock);
- rb_erase(&object->active_node, &cache->active_nodes);
- clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
- write_unlock(&cache->active_lock);
+/*
+ * Remove a regular file from the cache.
+ */
+static int cachefiles_unlink(struct cachefiles_cache *cache,
+ struct cachefiles_object *object,
+ struct dentry *dir, struct dentry *dentry,
+ enum fscache_why_object_killed why)
+{
+ struct path path = {
+ .mnt = cache->mnt,
+ .dentry = dir,
+ };
+ int ret;
- wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+ trace_cachefiles_unlink(object, d_inode(dentry)->i_ino, why);
+ ret = security_path_unlink(&path, dentry);
+ if (ret < 0) {
+ cachefiles_io_error(cache, "Unlink security error");
+ return ret;
+ }
- /* This object can now be culled, so we need to let the daemon know
- * that there is something it can remove if it needs to.
- */
- atomic_long_add(i_blocks, &cache->b_released);
- if (atomic_inc_return(&cache->f_released))
- cachefiles_state_changed(cache);
+ ret = cachefiles_inject_remove_error();
+ if (ret == 0) {
+ ret = vfs_unlink(&init_user_ns, d_backing_inode(dir), dentry, NULL);
+ if (ret == -EIO)
+ cachefiles_io_error(cache, "Unlink failed");
+ }
+ if (ret != 0)
+ trace_cachefiles_vfs_error(object, d_backing_inode(dir), ret,
+ cachefiles_trace_unlink_error);
+ return ret;
}
/*
- * delete an object representation from the cache
- * - file backed objects are unlinked
- * - directory backed objects are stuffed into the graveyard for userspace to
+ * Delete an object representation from the cache
+ * - File backed objects are unlinked
+ * - Directory backed objects are stuffed into the graveyard for userspace to
* delete
- * - unlocks the directory mutex
*/
-static int cachefiles_bury_object(struct cachefiles_cache *cache,
- struct cachefiles_object *object,
- struct dentry *dir,
- struct dentry *rep,
- bool preemptive,
- enum fscache_why_object_killed why)
+int cachefiles_bury_object(struct cachefiles_cache *cache,
+ struct cachefiles_object *object,
+ struct dentry *dir,
+ struct dentry *rep,
+ enum fscache_why_object_killed why)
{
struct dentry *grave, *trap;
struct path path, path_to_graveyard;
@@ -298,29 +283,21 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
_enter(",'%pd','%pd'", dir, rep);
+ if (rep->d_parent != dir) {
+ inode_unlock(d_inode(dir));
+ _leave(" = -ESTALE");
+ return -ESTALE;
+ }
+
/* non-directories can just be unlinked */
if (!d_is_dir(rep)) {
- _debug("unlink stale object");
-
- path.mnt = cache->mnt;
- path.dentry = dir;
- ret = security_path_unlink(&path, rep);
- if (ret < 0) {
- cachefiles_io_error(cache, "Unlink security error");
- } else {
- trace_cachefiles_unlink(object, rep, why);
- ret = vfs_unlink(&init_user_ns, d_inode(dir), rep,
- NULL);
-
- if (preemptive)
- cachefiles_mark_object_buried(cache, rep, why);
- }
+ dget(rep); /* Stop the dentry being negated if it's only pinned
+ * by a file struct.
+ */
+ ret = cachefiles_unlink(cache, object, dir, rep, why);
+ dput(rep);
inode_unlock(d_inode(dir));
-
- if (ret == -EIO)
- cachefiles_io_error(cache, "Unlink failed");
-
_leave(" = %d", ret);
return ret;
}
@@ -368,14 +345,16 @@ try_again:
grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
if (IS_ERR(grave)) {
unlock_rename(cache->graveyard, dir);
+ trace_cachefiles_vfs_error(object, d_inode(cache->graveyard),
+ PTR_ERR(grave),
+ cachefiles_trace_lookup_error);
if (PTR_ERR(grave) == -ENOMEM) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
- cachefiles_io_error(cache, "Lookup error %ld",
- PTR_ERR(grave));
+ cachefiles_io_error(cache, "Lookup error %ld", PTR_ERR(grave));
return -EIO;
}
@@ -419,16 +398,19 @@ try_again:
.new_dir = d_inode(cache->graveyard),
.new_dentry = grave,
};
- trace_cachefiles_rename(object, rep, grave, why);
- ret = vfs_rename(&rd);
+ trace_cachefiles_rename(object, d_inode(rep)->i_ino, why);
+ ret = cachefiles_inject_read_error();
+ if (ret == 0)
+ ret = vfs_rename(&rd);
+ if (ret != 0)
+ trace_cachefiles_vfs_error(object, d_inode(dir), ret,
+ cachefiles_trace_rename_error);
if (ret != 0 && ret != -ENOMEM)
cachefiles_io_error(cache,
"Rename failed with error %d", ret);
-
- if (preemptive)
- cachefiles_mark_object_buried(cache, rep, why);
}
+ __cachefiles_unmark_inode_in_use(object, rep);
unlock_rename(cache->graveyard, dir);
dput(grave);
_leave(" = 0");
@@ -436,493 +418,365 @@ try_again:
}
/*
- * delete an object representation from the cache
+ * Delete a cache file.
*/
-int cachefiles_delete_object(struct cachefiles_cache *cache,
- struct cachefiles_object *object)
+int cachefiles_delete_object(struct cachefiles_object *object,
+ enum fscache_why_object_killed why)
{
- struct dentry *dir;
+ struct cachefiles_volume *volume = object->volume;
+ struct dentry *dentry = object->file->f_path.dentry;
+ struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash];
int ret;
- _enter(",OBJ%x{%pd}", object->fscache.debug_id, object->dentry);
+ _enter(",OBJ%x{%pD}", object->debug_id, object->file);
- ASSERT(object->dentry);
- ASSERT(d_backing_inode(object->dentry));
- ASSERT(object->dentry->d_parent);
+ /* Stop the dentry being negated if it's only pinned by a file struct. */
+ dget(dentry);
- dir = dget_parent(object->dentry);
-
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
-
- if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) {
- /* object allocation for the same key preemptively deleted this
- * object's file so that it could create its own file */
- _debug("object preemptively buried");
- inode_unlock(d_inode(dir));
- ret = 0;
- } else {
- /* we need to check that our parent is _still_ our parent - it
- * may have been renamed */
- if (dir == object->dentry->d_parent) {
- ret = cachefiles_bury_object(cache, object, dir,
- object->dentry, false,
- FSCACHE_OBJECT_WAS_RETIRED);
- } else {
- /* it got moved, presumably by cachefilesd culling it,
- * so it's no longer in the key path and we can ignore
- * it */
- inode_unlock(d_inode(dir));
- ret = 0;
- }
- }
-
- dput(dir);
- _leave(" = %d", ret);
+ inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT);
+ ret = cachefiles_unlink(volume->cache, object, fan, dentry, why);
+ inode_unlock(d_backing_inode(fan));
+ dput(dentry);
return ret;
}
/*
- * walk from the parent object to the child object through the backing
- * filesystem, creating directories as we go
+ * Create a temporary file and leave it unattached and un-xattr'd until the
+ * time comes to discard the object from memory.
*/
-int cachefiles_walk_to_object(struct cachefiles_object *parent,
- struct cachefiles_object *object,
- const char *key,
- struct cachefiles_xattr *auxdata)
+struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
{
- struct cachefiles_cache *cache;
- struct dentry *dir, *next = NULL;
- struct inode *inode;
+ struct cachefiles_volume *volume = object->volume;
+ struct cachefiles_cache *cache = volume->cache;
+ const struct cred *saved_cred;
+ struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash];
+ struct file *file;
struct path path;
- const char *name;
- int ret, nlen;
-
- _enter("OBJ%x{%pd},OBJ%x,%s,",
- parent->fscache.debug_id, parent->dentry,
- object->fscache.debug_id, key);
-
- cache = container_of(parent->fscache.cache,
- struct cachefiles_cache, cache);
- path.mnt = cache->mnt;
-
- ASSERT(parent->dentry);
- ASSERT(d_backing_inode(parent->dentry));
-
- if (!(d_is_dir(parent->dentry))) {
- // TODO: convert file to dir
- _leave("looking up in none directory");
- return -ENOBUFS;
- }
-
- dir = dget(parent->dentry);
-
-advance:
- /* attempt to transit the first directory component */
- name = key;
- nlen = strlen(key);
-
- /* key ends in a double NUL */
- key = key + nlen + 1;
- if (!*key)
- key = NULL;
-
-lookup_again:
- /* search the current directory for the element name */
- _debug("lookup '%s'", name);
-
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
+ uint64_t ni_size = object->cookie->object_size;
+ long ret;
- next = lookup_one_len(name, dir, nlen);
- if (IS_ERR(next)) {
- trace_cachefiles_lookup(object, next, NULL);
- goto lookup_error;
- }
+ ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
- inode = d_backing_inode(next);
- trace_cachefiles_lookup(object, next, inode);
- _debug("next -> %pd %s", next, inode ? "positive" : "negative");
-
- if (!key)
- object->new = !inode;
-
- /* if this element of the path doesn't exist, then the lookup phase
- * failed, and we can release any readers in the certain knowledge that
- * there's nothing for them to actually read */
- if (d_is_negative(next))
- fscache_object_lookup_negative(&object->fscache);
-
- /* we need to create the object if it's negative */
- if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
- /* index objects and intervening tree levels must be subdirs */
- if (d_is_negative(next)) {
- ret = cachefiles_has_space(cache, 1, 0);
- if (ret < 0)
- goto no_space_error;
-
- path.dentry = dir;
- ret = security_path_mkdir(&path, next, 0);
- if (ret < 0)
- goto create_error;
- ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0);
- if (!key)
- trace_cachefiles_mkdir(object, next, ret);
- if (ret < 0)
- goto create_error;
-
- if (unlikely(d_unhashed(next))) {
- dput(next);
- inode_unlock(d_inode(dir));
- goto lookup_again;
- }
- ASSERT(d_backing_inode(next));
-
- _debug("mkdir -> %pd{ino=%lu}",
- next, d_backing_inode(next)->i_ino);
-
- } else if (!d_can_lookup(next)) {
- pr_err("inode %lu is not a directory\n",
- d_backing_inode(next)->i_ino);
- ret = -ENOBUFS;
- goto error;
- }
+ cachefiles_begin_secure(cache, &saved_cred);
- } else {
- /* non-index objects start out life as files */
- if (d_is_negative(next)) {
- ret = cachefiles_has_space(cache, 1, 0);
- if (ret < 0)
- goto no_space_error;
-
- path.dentry = dir;
- ret = security_path_mknod(&path, next, S_IFREG, 0);
- if (ret < 0)
- goto create_error;
- ret = vfs_create(&init_user_ns, d_inode(dir), next,
- S_IFREG, true);
- trace_cachefiles_create(object, next, ret);
- if (ret < 0)
- goto create_error;
-
- ASSERT(d_backing_inode(next));
-
- _debug("create -> %pd{ino=%lu}",
- next, d_backing_inode(next)->i_ino);
-
- } else if (!d_can_lookup(next) &&
- !d_is_reg(next)
- ) {
- pr_err("inode %lu is not a file or directory\n",
- d_backing_inode(next)->i_ino);
- ret = -ENOBUFS;
- goto error;
+ path.mnt = cache->mnt;
+ ret = cachefiles_inject_write_error();
+ if (ret == 0)
+ path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR);
+ else
+ path.dentry = ERR_PTR(ret);
+ if (IS_ERR(path.dentry)) {
+ trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(path.dentry),
+ cachefiles_trace_tmpfile_error);
+ if (PTR_ERR(path.dentry) == -EIO)
+ cachefiles_io_error_obj(object, "Failed to create tmpfile");
+ file = ERR_CAST(path.dentry);
+ goto out;
+ }
+
+ trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry));
+
+ if (!cachefiles_mark_inode_in_use(object, path.dentry)) {
+ file = ERR_PTR(-EBUSY);
+ goto out_dput;
+ }
+
+ if (ni_size > 0) {
+ trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size,
+ cachefiles_trunc_expand_tmpfile);
+ ret = cachefiles_inject_write_error();
+ if (ret == 0)
+ ret = vfs_truncate(&path, ni_size);
+ if (ret < 0) {
+ trace_cachefiles_vfs_error(
+ object, d_backing_inode(path.dentry), ret,
+ cachefiles_trace_trunc_error);
+ file = ERR_PTR(ret);
+ goto out_unuse;
}
}
- /* process the next component */
- if (key) {
- _debug("advance");
- inode_unlock(d_inode(dir));
- dput(dir);
- dir = next;
- next = NULL;
- goto advance;
+ file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
+ d_backing_inode(path.dentry), cache->cache_cred);
+ if (IS_ERR(file)) {
+ trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry),
+ PTR_ERR(file),
+ cachefiles_trace_open_error);
+ goto out_unuse;
+ }
+ if (unlikely(!file->f_op->read_iter) ||
+ unlikely(!file->f_op->write_iter)) {
+ fput(file);
+ pr_notice("Cache does not support read_iter and write_iter\n");
+ file = ERR_PTR(-EINVAL);
+ goto out_unuse;
}
- /* we've found the object we were looking for */
- object->dentry = next;
-
- /* if we've found that the terminal object exists, then we need to
- * check its attributes and delete it if it's out of date */
- if (!object->new) {
- _debug("validate '%pd'", next);
+ goto out_dput;
- ret = cachefiles_check_object_xattr(object, auxdata);
- if (ret == -ESTALE) {
- /* delete the object (the deleter drops the directory
- * mutex) */
- object->dentry = NULL;
+out_unuse:
+ cachefiles_do_unmark_inode_in_use(object, path.dentry);
+out_dput:
+ dput(path.dentry);
+out:
+ cachefiles_end_secure(cache, saved_cred);
+ return file;
+}
- ret = cachefiles_bury_object(cache, object, dir, next,
- true,
- FSCACHE_OBJECT_IS_STALE);
- dput(next);
- next = NULL;
+/*
+ * Create a new file.
+ */
+static bool cachefiles_create_file(struct cachefiles_object *object)
+{
+ struct file *file;
+ int ret;
- if (ret < 0)
- goto delete_error;
+ ret = cachefiles_has_space(object->volume->cache, 1, 0,
+ cachefiles_has_space_for_create);
+ if (ret < 0)
+ return false;
- _debug("redo lookup");
- fscache_object_retrying_stale(&object->fscache);
- goto lookup_again;
- }
- }
+ file = cachefiles_create_tmpfile(object);
+ if (IS_ERR(file))
+ return false;
- /* note that we're now using this object */
- ret = cachefiles_mark_object_active(cache, object);
+ set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags);
+ set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags);
+ _debug("create -> %pD{ino=%lu}", file, file_inode(file)->i_ino);
+ object->file = file;
+ return true;
+}
- inode_unlock(d_inode(dir));
- dput(dir);
- dir = NULL;
+/*
+ * Open an existing file, checking its attributes and replacing it if it is
+ * stale.
+ */
+static bool cachefiles_open_file(struct cachefiles_object *object,
+ struct dentry *dentry)
+{
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct file *file;
+ struct path path;
+ int ret;
- if (ret == -ETIMEDOUT)
- goto mark_active_timed_out;
+ _enter("%pd", dentry);
- _debug("=== OBTAINED_OBJECT ===");
+ if (!cachefiles_mark_inode_in_use(object, dentry))
+ return false;
- if (object->new) {
- /* attach data to a newly constructed terminal object */
- ret = cachefiles_set_object_xattr(object, auxdata);
- if (ret < 0)
- goto check_error;
- } else {
- /* always update the atime on an object we've just looked up
- * (this is used to keep track of culling, and atimes are only
- * updated by read, write and readdir but not lookup or
- * open) */
- path.dentry = next;
- touch_atime(&path);
- }
-
- /* open a file interface onto a data file */
- if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
- if (d_is_reg(object->dentry)) {
- const struct address_space_operations *aops;
-
- ret = -EPERM;
- aops = d_backing_inode(object->dentry)->i_mapping->a_ops;
- if (!aops->bmap)
- goto check_error;
- if (object->dentry->d_sb->s_blocksize > PAGE_SIZE)
- goto check_error;
-
- object->backer = object->dentry;
- } else {
- BUG(); // TODO: open file in data-class subdir
- }
+ /* We need to open a file interface onto a data file now as we can't do
+ * it on demand because writeback called from do_exit() sees
+ * current->fs == NULL - which breaks d_path() called from ext4 open.
+ */
+ path.mnt = cache->mnt;
+ path.dentry = dentry;
+ file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
+ d_backing_inode(dentry), cache->cache_cred);
+ if (IS_ERR(file)) {
+ trace_cachefiles_vfs_error(object, d_backing_inode(dentry),
+ PTR_ERR(file),
+ cachefiles_trace_open_error);
+ goto error;
}
- object->new = 0;
- fscache_obtained_object(&object->fscache);
-
- _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino);
- return 0;
-
-no_space_error:
- fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE);
-create_error:
- _debug("create error %d", ret);
- if (ret == -EIO)
- cachefiles_io_error(cache, "Create/mkdir failed");
- goto error;
+ if (unlikely(!file->f_op->read_iter) ||
+ unlikely(!file->f_op->write_iter)) {
+ pr_notice("Cache does not support read_iter and write_iter\n");
+ goto error_fput;
+ }
+ _debug("file -> %pd positive", dentry);
-mark_active_timed_out:
- _debug("mark active timed out");
- goto release_dentry;
+ ret = cachefiles_check_auxdata(object, file);
+ if (ret < 0)
+ goto check_failed;
-check_error:
- _debug("check error %d", ret);
- cachefiles_mark_object_inactive(
- cache, object, d_backing_inode(object->dentry)->i_blocks);
-release_dentry:
- dput(object->dentry);
- object->dentry = NULL;
- goto error_out;
-
-delete_error:
- _debug("delete error %d", ret);
- goto error_out2;
+ object->file = file;
-lookup_error:
- _debug("lookup error %ld", PTR_ERR(next));
- ret = PTR_ERR(next);
- if (ret == -EIO)
- cachefiles_io_error(cache, "Lookup failed");
- next = NULL;
+ /* Always update the atime on an object we've just looked up (this is
+ * used to keep track of culling, and atimes are only updated by read,
+ * write and readdir but not lookup or open).
+ */
+ touch_atime(&file->f_path);
+ dput(dentry);
+ return true;
+
+check_failed:
+ fscache_cookie_lookup_negative(object->cookie);
+ cachefiles_unmark_inode_in_use(object, file);
+ fput(file);
+ dput(dentry);
+ if (ret == -ESTALE)
+ return cachefiles_create_file(object);
+ return false;
+
+error_fput:
+ fput(file);
error:
- inode_unlock(d_inode(dir));
- dput(next);
-error_out2:
- dput(dir);
-error_out:
- _leave(" = error %d", -ret);
- return ret;
+ cachefiles_do_unmark_inode_in_use(object, dentry);
+ dput(dentry);
+ return false;
}
/*
- * get a subdirectory
+ * walk from the parent object to the child object through the backing
+ * filesystem, creating directories as we go
*/
-struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
- struct dentry *dir,
- const char *dirname)
+bool cachefiles_look_up_object(struct cachefiles_object *object)
{
- struct dentry *subdir;
- struct path path;
+ struct cachefiles_volume *volume = object->volume;
+ struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash];
int ret;
- _enter(",,%s", dirname);
-
- /* search the current directory for the element name */
- inode_lock(d_inode(dir));
-
-retry:
- subdir = lookup_one_len(dirname, dir, strlen(dirname));
- if (IS_ERR(subdir)) {
- if (PTR_ERR(subdir) == -ENOMEM)
- goto nomem_d_alloc;
- goto lookup_error;
+ _enter("OBJ%x,%s,", object->debug_id, object->d_name);
+
+ /* Look up path "cache/vol/fanout/file". */
+ ret = cachefiles_inject_read_error();
+ if (ret == 0)
+ dentry = lookup_positive_unlocked(object->d_name, fan,
+ object->d_name_len);
+ else
+ dentry = ERR_PTR(ret);
+ trace_cachefiles_lookup(object, fan, dentry);
+ if (IS_ERR(dentry)) {
+ if (dentry == ERR_PTR(-ENOENT))
+ goto new_file;
+ if (dentry == ERR_PTR(-EIO))
+ cachefiles_io_error_obj(object, "Lookup failed");
+ return false;
+ }
+
+ if (!d_is_reg(dentry)) {
+ pr_err("%pd is not a file\n", dentry);
+ inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
+ ret = cachefiles_bury_object(volume->cache, object, fan, dentry,
+ FSCACHE_OBJECT_IS_WEIRD);
+ dput(dentry);
+ if (ret < 0)
+ return false;
+ goto new_file;
}
- _debug("subdir -> %pd %s",
- subdir, d_backing_inode(subdir) ? "positive" : "negative");
+ if (!cachefiles_open_file(object, dentry))
+ return false;
- /* we need to create the subdir if it doesn't exist yet */
- if (d_is_negative(subdir)) {
- ret = cachefiles_has_space(cache, 1, 0);
- if (ret < 0)
- goto mkdir_error;
+ _leave(" = t [%lu]", file_inode(object->file)->i_ino);
+ return true;
- _debug("attempt mkdir");
+new_file:
+ fscache_cookie_lookup_negative(object->cookie);
+ return cachefiles_create_file(object);
+}
- path.mnt = cache->mnt;
- path.dentry = dir;
- ret = security_path_mkdir(&path, subdir, 0700);
- if (ret < 0)
- goto mkdir_error;
- ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700);
- if (ret < 0)
- goto mkdir_error;
+/*
+ * Attempt to link a temporary file into its rightful place in the cache.
+ */
+bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
+ struct cachefiles_object *object)
+{
+ struct cachefiles_volume *volume = object->volume;
+ struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash];
+ bool success = false;
+ int ret;
- if (unlikely(d_unhashed(subdir))) {
- dput(subdir);
- goto retry;
+ _enter(",%pD", object->file);
+
+ inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
+ ret = cachefiles_inject_read_error();
+ if (ret == 0)
+ dentry = lookup_one_len(object->d_name, fan, object->d_name_len);
+ else
+ dentry = ERR_PTR(ret);
+ if (IS_ERR(dentry)) {
+ trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry),
+ cachefiles_trace_lookup_error);
+ _debug("lookup fail %ld", PTR_ERR(dentry));
+ goto out_unlock;
+ }
+
+ if (!d_is_negative(dentry)) {
+ if (d_backing_inode(dentry) == file_inode(object->file)) {
+ success = true;
+ goto out_dput;
}
- ASSERT(d_backing_inode(subdir));
- _debug("mkdir -> %pd{ino=%lu}",
- subdir, d_backing_inode(subdir)->i_ino);
- }
-
- inode_unlock(d_inode(dir));
-
- /* we need to make sure the subdir is a directory */
- ASSERT(d_backing_inode(subdir));
+ ret = cachefiles_unlink(volume->cache, object, fan, dentry,
+ FSCACHE_OBJECT_IS_STALE);
+ if (ret < 0)
+ goto out_dput;
- if (!d_can_lookup(subdir)) {
- pr_err("%s is not a directory\n", dirname);
- ret = -EIO;
- goto check_error;
+ dput(dentry);
+ ret = cachefiles_inject_read_error();
+ if (ret == 0)
+ dentry = lookup_one_len(object->d_name, fan, object->d_name_len);
+ else
+ dentry = ERR_PTR(ret);
+ if (IS_ERR(dentry)) {
+ trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry),
+ cachefiles_trace_lookup_error);
+ _debug("lookup fail %ld", PTR_ERR(dentry));
+ goto out_unlock;
+ }
}
- ret = -EPERM;
- if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) ||
- !d_backing_inode(subdir)->i_op->lookup ||
- !d_backing_inode(subdir)->i_op->mkdir ||
- !d_backing_inode(subdir)->i_op->create ||
- !d_backing_inode(subdir)->i_op->rename ||
- !d_backing_inode(subdir)->i_op->rmdir ||
- !d_backing_inode(subdir)->i_op->unlink)
- goto check_error;
-
- _leave(" = [%lu]", d_backing_inode(subdir)->i_ino);
- return subdir;
-
-check_error:
- dput(subdir);
- _leave(" = %d [check]", ret);
- return ERR_PTR(ret);
-
-mkdir_error:
- inode_unlock(d_inode(dir));
- dput(subdir);
- pr_err("mkdir %s failed with error %d\n", dirname, ret);
- return ERR_PTR(ret);
-
-lookup_error:
- inode_unlock(d_inode(dir));
- ret = PTR_ERR(subdir);
- pr_err("Lookup %s failed with error %d\n", dirname, ret);
- return ERR_PTR(ret);
-
-nomem_d_alloc:
- inode_unlock(d_inode(dir));
- _leave(" = -ENOMEM");
- return ERR_PTR(-ENOMEM);
+ ret = cachefiles_inject_read_error();
+ if (ret == 0)
+ ret = vfs_link(object->file->f_path.dentry, &init_user_ns,
+ d_inode(fan), dentry, NULL);
+ if (ret < 0) {
+ trace_cachefiles_vfs_error(object, d_inode(fan), ret,
+ cachefiles_trace_link_error);
+ _debug("link fail %d", ret);
+ } else {
+ trace_cachefiles_link(object, file_inode(object->file));
+ spin_lock(&object->lock);
+ /* TODO: Do we want to switch the file pointer to the new dentry? */
+ clear_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags);
+ spin_unlock(&object->lock);
+ success = true;
+ }
+
+out_dput:
+ dput(dentry);
+out_unlock:
+ inode_unlock(d_inode(fan));
+ _leave(" = %u", success);
+ return success;
}
/*
- * find out if an object is in use or not
- * - if finds object and it's not in use:
- * - returns a pointer to the object and a reference on it
- * - returns with the directory locked
+ * Look up an inode to be checked or culled. Return -EBUSY if the inode is
+ * marked in use.
*/
-static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
- struct dentry *dir,
- char *filename)
+static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache,
+ struct dentry *dir,
+ char *filename)
{
- struct cachefiles_object *object;
- struct rb_node *_n;
struct dentry *victim;
- int ret;
-
- //_enter(",%pd/,%s",
- // dir, filename);
+ int ret = -ENOENT;
- /* look up the victim */
inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
victim = lookup_one_len(filename, dir, strlen(filename));
if (IS_ERR(victim))
goto lookup_error;
-
- //_debug("victim -> %pd %s",
- // victim, d_backing_inode(victim) ? "positive" : "negative");
-
- /* if the object is no longer there then we probably retired the object
- * at the netfs's request whilst the cull was in progress
- */
- if (d_is_negative(victim)) {
- inode_unlock(d_inode(dir));
- dput(victim);
- _leave(" = -ENOENT [absent]");
- return ERR_PTR(-ENOENT);
- }
-
- /* check to see if we're using this object */
- read_lock(&cache->active_lock);
-
- _n = cache->active_nodes.rb_node;
-
- while (_n) {
- object = rb_entry(_n, struct cachefiles_object, active_node);
-
- if (object->dentry > victim)
- _n = _n->rb_left;
- else if (object->dentry < victim)
- _n = _n->rb_right;
- else
- goto object_in_use;
- }
-
- read_unlock(&cache->active_lock);
-
- //_leave(" = %pd", victim);
+ if (d_is_negative(victim))
+ goto lookup_put;
+ if (d_inode(victim)->i_flags & S_KERNEL_FILE)
+ goto lookup_busy;
return victim;
-object_in_use:
- read_unlock(&cache->active_lock);
+lookup_busy:
+ ret = -EBUSY;
+lookup_put:
inode_unlock(d_inode(dir));
dput(victim);
- //_leave(" = -EBUSY [in use]");
- return ERR_PTR(-EBUSY);
+ return ERR_PTR(ret);
lookup_error:
inode_unlock(d_inode(dir));
ret = PTR_ERR(victim);
- if (ret == -ENOENT) {
- /* file or dir now absent - probably retired by netfs */
- _leave(" = -ESTALE [absent]");
- return ERR_PTR(-ESTALE);
- }
+ if (ret == -ENOENT)
+ return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */
if (ret == -EIO) {
cachefiles_io_error(cache, "Lookup failed");
@@ -931,46 +785,46 @@ lookup_error:
ret = -EIO;
}
- _leave(" = %d", ret);
return ERR_PTR(ret);
}
/*
- * cull an object if it's not in use
+ * Cull an object if it's not in use
* - called only by cache manager daemon
*/
int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
char *filename)
{
struct dentry *victim;
+ struct inode *inode;
int ret;
_enter(",%pd/,%s", dir, filename);
- victim = cachefiles_check_active(cache, dir, filename);
+ victim = cachefiles_lookup_for_cull(cache, dir, filename);
if (IS_ERR(victim))
return PTR_ERR(victim);
- _debug("victim -> %pd %s",
- victim, d_backing_inode(victim) ? "positive" : "negative");
-
- /* okay... the victim is not being used so we can cull it
- * - start by marking it as stale
- */
- _debug("victim is cullable");
-
- ret = cachefiles_remove_object_xattr(cache, victim);
+ /* check to see if someone is using this object */
+ inode = d_inode(victim);
+ inode_lock(inode);
+ if (inode->i_flags & S_KERNEL_FILE) {
+ ret = -EBUSY;
+ } else {
+ /* Stop the cache from picking it back up */
+ inode->i_flags |= S_KERNEL_FILE;
+ ret = 0;
+ }
+ inode_unlock(inode);
if (ret < 0)
goto error_unlock;
- /* actually remove the victim (drops the dir mutex) */
- _debug("bury");
-
- ret = cachefiles_bury_object(cache, NULL, dir, victim, false,
+ ret = cachefiles_bury_object(cache, NULL, dir, victim,
FSCACHE_OBJECT_WAS_CULLED);
if (ret < 0)
goto error;
+ fscache_count_culled();
dput(victim);
_leave(" = 0");
return 0;
@@ -979,11 +833,8 @@ error_unlock:
inode_unlock(d_inode(dir));
error:
dput(victim);
- if (ret == -ENOENT) {
- /* file or dir now absent - probably retired by netfs */
- _leave(" = -ESTALE [absent]");
- return -ESTALE;
- }
+ if (ret == -ENOENT)
+ return -ESTALE; /* Probably got retired by the netfs */
if (ret != -ENOMEM) {
pr_err("Internal error: %d\n", ret);
@@ -995,7 +846,7 @@ error:
}
/*
- * find out if an object is in use or not
+ * Find out if an object is in use or not
* - called only by cache manager daemon
* - returns -EBUSY or 0 to indicate whether an object is in use or not
*/
@@ -1003,16 +854,13 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
char *filename)
{
struct dentry *victim;
+ int ret = 0;
- //_enter(",%pd/,%s",
- // dir, filename);
-
- victim = cachefiles_check_active(cache, dir, filename);
+ victim = cachefiles_lookup_for_cull(cache, dir, filename);
if (IS_ERR(victim))
return PTR_ERR(victim);
inode_unlock(d_inode(dir));
dput(victim);
- //_leave(" = 0");
- return 0;
+ return ret;
}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
deleted file mode 100644
index 8ffc40e84a59..000000000000
--- a/fs/cachefiles/rdwr.c
+++ /dev/null
@@ -1,972 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Storage object read/write
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/mount.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/swap.h>
-#include "internal.h"
-
-/*
- * detect wake up events generated by the unlocking of pages in which we're
- * interested
- * - we use this to detect read completion of backing pages
- * - the caller holds the waitqueue lock
- */
-static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
- int sync, void *_key)
-{
- struct cachefiles_one_read *monitor =
- container_of(wait, struct cachefiles_one_read, monitor);
- struct cachefiles_object *object;
- struct fscache_retrieval *op = monitor->op;
- struct wait_page_key *key = _key;
- struct page *page = wait->private;
-
- ASSERT(key);
-
- _enter("{%lu},%u,%d,{%p,%u}",
- monitor->netfs_page->index, mode, sync,
- key->page, key->bit_nr);
-
- if (key->page != page || key->bit_nr != PG_locked)
- return 0;
-
- _debug("--- monitor %p %lx ---", page, page->flags);
-
- if (!PageUptodate(page) && !PageError(page)) {
- /* unlocked, not uptodate and not erronous? */
- _debug("page probably truncated");
- }
-
- /* remove from the waitqueue */
- list_del(&wait->entry);
-
- /* move onto the action list and queue for FS-Cache thread pool */
- ASSERT(op);
-
- /* We need to temporarily bump the usage count as we don't own a ref
- * here otherwise cachefiles_read_copier() may free the op between the
- * monitor being enqueued on the op->to_do list and the op getting
- * enqueued on the work queue.
- */
- fscache_get_retrieval(op);
-
- object = container_of(op->op.object, struct cachefiles_object, fscache);
- spin_lock(&object->work_lock);
- list_add_tail(&monitor->op_link, &op->to_do);
- fscache_enqueue_retrieval(op);
- spin_unlock(&object->work_lock);
-
- fscache_put_retrieval(op);
- return 0;
-}
-
-/*
- * handle a probably truncated page
- * - check to see if the page is still relevant and reissue the read if
- * possible
- * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we
- * must wait again and 0 if successful
- */
-static int cachefiles_read_reissue(struct cachefiles_object *object,
- struct cachefiles_one_read *monitor)
-{
- struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping;
- struct page *backpage = monitor->back_page, *backpage2;
- int ret;
-
- _enter("{ino=%lx},{%lx,%lx}",
- d_backing_inode(object->backer)->i_ino,
- backpage->index, backpage->flags);
-
- /* skip if the page was truncated away completely */
- if (backpage->mapping != bmapping) {
- _leave(" = -ENODATA [mapping]");
- return -ENODATA;
- }
-
- backpage2 = find_get_page(bmapping, backpage->index);
- if (!backpage2) {
- _leave(" = -ENODATA [gone]");
- return -ENODATA;
- }
-
- if (backpage != backpage2) {
- put_page(backpage2);
- _leave(" = -ENODATA [different]");
- return -ENODATA;
- }
-
- /* the page is still there and we already have a ref on it, so we don't
- * need a second */
- put_page(backpage2);
-
- INIT_LIST_HEAD(&monitor->op_link);
- add_page_wait_queue(backpage, &monitor->monitor);
-
- if (trylock_page(backpage)) {
- ret = -EIO;
- if (PageError(backpage))
- goto unlock_discard;
- ret = 0;
- if (PageUptodate(backpage))
- goto unlock_discard;
-
- _debug("reissue read");
- ret = bmapping->a_ops->readpage(NULL, backpage);
- if (ret < 0)
- goto discard;
- }
-
- /* but the page may have been read before the monitor was installed, so
- * the monitor may miss the event - so we have to ensure that we do get
- * one in such a case */
- if (trylock_page(backpage)) {
- _debug("jumpstart %p {%lx}", backpage, backpage->flags);
- unlock_page(backpage);
- }
-
- /* it'll reappear on the todo list */
- _leave(" = -EINPROGRESS");
- return -EINPROGRESS;
-
-unlock_discard:
- unlock_page(backpage);
-discard:
- spin_lock_irq(&object->work_lock);
- list_del(&monitor->op_link);
- spin_unlock_irq(&object->work_lock);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * copy data from backing pages to netfs pages to complete a read operation
- * - driven by FS-Cache's thread pool
- */
-static void cachefiles_read_copier(struct fscache_operation *_op)
-{
- struct cachefiles_one_read *monitor;
- struct cachefiles_object *object;
- struct fscache_retrieval *op;
- int error, max;
-
- op = container_of(_op, struct fscache_retrieval, op);
- object = container_of(op->op.object,
- struct cachefiles_object, fscache);
-
- _enter("{ino=%lu}", d_backing_inode(object->backer)->i_ino);
-
- max = 8;
- spin_lock_irq(&object->work_lock);
-
- while (!list_empty(&op->to_do)) {
- monitor = list_entry(op->to_do.next,
- struct cachefiles_one_read, op_link);
- list_del(&monitor->op_link);
-
- spin_unlock_irq(&object->work_lock);
-
- _debug("- copy {%lu}", monitor->back_page->index);
-
- recheck:
- if (test_bit(FSCACHE_COOKIE_INVALIDATING,
- &object->fscache.cookie->flags)) {
- error = -ESTALE;
- } else if (PageUptodate(monitor->back_page)) {
- copy_highpage(monitor->netfs_page, monitor->back_page);
- fscache_mark_page_cached(monitor->op,
- monitor->netfs_page);
- error = 0;
- } else if (!PageError(monitor->back_page)) {
- /* the page has probably been truncated */
- error = cachefiles_read_reissue(object, monitor);
- if (error == -EINPROGRESS)
- goto next;
- goto recheck;
- } else {
- cachefiles_io_error_obj(
- object,
- "Readpage failed on backing file %lx",
- (unsigned long) monitor->back_page->flags);
- error = -EIO;
- }
-
- put_page(monitor->back_page);
-
- fscache_end_io(op, monitor->netfs_page, error);
- put_page(monitor->netfs_page);
- fscache_retrieval_complete(op, 1);
- fscache_put_retrieval(op);
- kfree(monitor);
-
- next:
- /* let the thread pool have some air occasionally */
- max--;
- if (max < 0 || need_resched()) {
- if (!list_empty(&op->to_do))
- fscache_enqueue_retrieval(op);
- _leave(" [maxed out]");
- return;
- }
-
- spin_lock_irq(&object->work_lock);
- }
-
- spin_unlock_irq(&object->work_lock);
- _leave("");
-}
-
-/*
- * read the corresponding page to the given set from the backing file
- * - an uncertain page is simply discarded, to be tried again another time
- */
-static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
- struct fscache_retrieval *op,
- struct page *netpage)
-{
- struct cachefiles_one_read *monitor;
- struct address_space *bmapping;
- struct page *newpage, *backpage;
- int ret;
-
- _enter("");
-
- _debug("read back %p{%lu,%d}",
- netpage, netpage->index, page_count(netpage));
-
- monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
- if (!monitor)
- goto nomem;
-
- monitor->netfs_page = netpage;
- monitor->op = fscache_get_retrieval(op);
-
- init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
-
- /* attempt to get hold of the backing page */
- bmapping = d_backing_inode(object->backer)->i_mapping;
- newpage = NULL;
-
- for (;;) {
- backpage = find_get_page(bmapping, netpage->index);
- if (backpage)
- goto backing_page_already_present;
-
- if (!newpage) {
- newpage = __page_cache_alloc(cachefiles_gfp);
- if (!newpage)
- goto nomem_monitor;
- }
-
- ret = add_to_page_cache_lru(newpage, bmapping,
- netpage->index, cachefiles_gfp);
- if (ret == 0)
- goto installed_new_backing_page;
- if (ret != -EEXIST)
- goto nomem_page;
- }
-
- /* we've installed a new backing page, so now we need to start
- * it reading */
-installed_new_backing_page:
- _debug("- new %p", newpage);
-
- backpage = newpage;
- newpage = NULL;
-
-read_backing_page:
- ret = bmapping->a_ops->readpage(NULL, backpage);
- if (ret < 0)
- goto read_error;
-
- /* set the monitor to transfer the data across */
-monitor_backing_page:
- _debug("- monitor add");
-
- /* install the monitor */
- get_page(monitor->netfs_page);
- get_page(backpage);
- monitor->back_page = backpage;
- monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
- monitor = NULL;
-
- /* but the page may have been read before the monitor was installed, so
- * the monitor may miss the event - so we have to ensure that we do get
- * one in such a case */
- if (trylock_page(backpage)) {
- _debug("jumpstart %p {%lx}", backpage, backpage->flags);
- unlock_page(backpage);
- }
- goto success;
-
- /* if the backing page is already present, it can be in one of
- * three states: read in progress, read failed or read okay */
-backing_page_already_present:
- _debug("- present");
-
- if (newpage) {
- put_page(newpage);
- newpage = NULL;
- }
-
- if (PageError(backpage))
- goto io_error;
-
- if (PageUptodate(backpage))
- goto backing_page_already_uptodate;
-
- if (!trylock_page(backpage))
- goto monitor_backing_page;
- _debug("read %p {%lx}", backpage, backpage->flags);
- goto read_backing_page;
-
- /* the backing page is already up to date, attach the netfs
- * page to the pagecache and LRU and copy the data across */
-backing_page_already_uptodate:
- _debug("- uptodate");
-
- fscache_mark_page_cached(op, netpage);
-
- copy_highpage(netpage, backpage);
- fscache_end_io(op, netpage, 0);
- fscache_retrieval_complete(op, 1);
-
-success:
- _debug("success");
- ret = 0;
-
-out:
- if (backpage)
- put_page(backpage);
- if (monitor) {
- fscache_put_retrieval(monitor->op);
- kfree(monitor);
- }
- _leave(" = %d", ret);
- return ret;
-
-read_error:
- _debug("read error %d", ret);
- if (ret == -ENOMEM) {
- fscache_retrieval_complete(op, 1);
- goto out;
- }
-io_error:
- cachefiles_io_error_obj(object, "Page read error on backing file");
- fscache_retrieval_complete(op, 1);
- ret = -ENOBUFS;
- goto out;
-
-nomem_page:
- put_page(newpage);
-nomem_monitor:
- fscache_put_retrieval(monitor->op);
- kfree(monitor);
-nomem:
- fscache_retrieval_complete(op, 1);
- _leave(" = -ENOMEM");
- return -ENOMEM;
-}
-
-/*
- * read a page from the cache or allocate a block in which to store it
- * - cache withdrawal is prevented by the caller
- * - returns -EINTR if interrupted
- * - returns -ENOMEM if ran out of memory
- * - returns -ENOBUFS if no buffers can be made available
- * - returns -ENOBUFS if page is beyond EOF
- * - if the page is backed by a block in the cache:
- * - a read will be started which will call the callback on completion
- * - 0 will be returned
- * - else if the page is unbacked:
- * - the metadata will be retained
- * - -ENODATA will be returned
- */
-int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
- struct page *page,
- gfp_t gfp)
-{
- struct cachefiles_object *object;
- struct cachefiles_cache *cache;
- struct inode *inode;
- sector_t block;
- unsigned shift;
- int ret, ret2;
-
- object = container_of(op->op.object,
- struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
-
- _enter("{%p},{%lx},,,", object, page->index);
-
- if (!object->backer)
- goto enobufs;
-
- inode = d_backing_inode(object->backer);
- ASSERT(S_ISREG(inode->i_mode));
-
- /* calculate the shift required to use bmap */
- shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
-
- op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
- op->op.flags |= FSCACHE_OP_ASYNC;
- op->op.processor = cachefiles_read_copier;
-
- /* we assume the absence or presence of the first block is a good
- * enough indication for the page as a whole
- * - TODO: don't use bmap() for this as it is _not_ actually good
- * enough for this as it doesn't indicate errors, but it's all we've
- * got for the moment
- */
- block = page->index;
- block <<= shift;
-
- ret2 = bmap(inode, &block);
- ASSERT(ret2 == 0);
-
- _debug("%llx -> %llx",
- (unsigned long long) (page->index << shift),
- (unsigned long long) block);
-
- if (block) {
- /* submit the apparently valid page to the backing fs to be
- * read from disk */
- ret = cachefiles_read_backing_file_one(object, op, page);
- } else if (cachefiles_has_space(cache, 0, 1) == 0) {
- /* there's space in the cache we can use */
- fscache_mark_page_cached(op, page);
- fscache_retrieval_complete(op, 1);
- ret = -ENODATA;
- } else {
- goto enobufs;
- }
-
- _leave(" = %d", ret);
- return ret;
-
-enobufs:
- fscache_retrieval_complete(op, 1);
- _leave(" = -ENOBUFS");
- return -ENOBUFS;
-}
-
-/*
- * read the corresponding pages to the given set from the backing file
- * - any uncertain pages are simply discarded, to be tried again another time
- */
-static int cachefiles_read_backing_file(struct cachefiles_object *object,
- struct fscache_retrieval *op,
- struct list_head *list)
-{
- struct cachefiles_one_read *monitor = NULL;
- struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping;
- struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
- int ret = 0;
-
- _enter("");
-
- list_for_each_entry_safe(netpage, _n, list, lru) {
- list_del(&netpage->lru);
-
- _debug("read back %p{%lu,%d}",
- netpage, netpage->index, page_count(netpage));
-
- if (!monitor) {
- monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
- if (!monitor)
- goto nomem;
-
- monitor->op = fscache_get_retrieval(op);
- init_waitqueue_func_entry(&monitor->monitor,
- cachefiles_read_waiter);
- }
-
- for (;;) {
- backpage = find_get_page(bmapping, netpage->index);
- if (backpage)
- goto backing_page_already_present;
-
- if (!newpage) {
- newpage = __page_cache_alloc(cachefiles_gfp);
- if (!newpage)
- goto nomem;
- }
-
- ret = add_to_page_cache_lru(newpage, bmapping,
- netpage->index,
- cachefiles_gfp);
- if (ret == 0)
- goto installed_new_backing_page;
- if (ret != -EEXIST)
- goto nomem;
- }
-
- /* we've installed a new backing page, so now we need
- * to start it reading */
- installed_new_backing_page:
- _debug("- new %p", newpage);
-
- backpage = newpage;
- newpage = NULL;
-
- reread_backing_page:
- ret = bmapping->a_ops->readpage(NULL, backpage);
- if (ret < 0)
- goto read_error;
-
- /* add the netfs page to the pagecache and LRU, and set the
- * monitor to transfer the data across */
- monitor_backing_page:
- _debug("- monitor add");
-
- ret = add_to_page_cache_lru(netpage, op->mapping,
- netpage->index, cachefiles_gfp);
- if (ret < 0) {
- if (ret == -EEXIST) {
- put_page(backpage);
- backpage = NULL;
- put_page(netpage);
- netpage = NULL;
- fscache_retrieval_complete(op, 1);
- continue;
- }
- goto nomem;
- }
-
- /* install a monitor */
- get_page(netpage);
- monitor->netfs_page = netpage;
-
- get_page(backpage);
- monitor->back_page = backpage;
- monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
- monitor = NULL;
-
- /* but the page may have been read before the monitor was
- * installed, so the monitor may miss the event - so we have to
- * ensure that we do get one in such a case */
- if (trylock_page(backpage)) {
- _debug("2unlock %p {%lx}", backpage, backpage->flags);
- unlock_page(backpage);
- }
-
- put_page(backpage);
- backpage = NULL;
-
- put_page(netpage);
- netpage = NULL;
- continue;
-
- /* if the backing page is already present, it can be in one of
- * three states: read in progress, read failed or read okay */
- backing_page_already_present:
- _debug("- present %p", backpage);
-
- if (PageError(backpage))
- goto io_error;
-
- if (PageUptodate(backpage))
- goto backing_page_already_uptodate;
-
- _debug("- not ready %p{%lx}", backpage, backpage->flags);
-
- if (!trylock_page(backpage))
- goto monitor_backing_page;
-
- if (PageError(backpage)) {
- _debug("error %lx", backpage->flags);
- unlock_page(backpage);
- goto io_error;
- }
-
- if (PageUptodate(backpage))
- goto backing_page_already_uptodate_unlock;
-
- /* we've locked a page that's neither up to date nor erroneous,
- * so we need to attempt to read it again */
- goto reread_backing_page;
-
- /* the backing page is already up to date, attach the netfs
- * page to the pagecache and LRU and copy the data across */
- backing_page_already_uptodate_unlock:
- _debug("uptodate %lx", backpage->flags);
- unlock_page(backpage);
- backing_page_already_uptodate:
- _debug("- uptodate");
-
- ret = add_to_page_cache_lru(netpage, op->mapping,
- netpage->index, cachefiles_gfp);
- if (ret < 0) {
- if (ret == -EEXIST) {
- put_page(backpage);
- backpage = NULL;
- put_page(netpage);
- netpage = NULL;
- fscache_retrieval_complete(op, 1);
- continue;
- }
- goto nomem;
- }
-
- copy_highpage(netpage, backpage);
-
- put_page(backpage);
- backpage = NULL;
-
- fscache_mark_page_cached(op, netpage);
-
- /* the netpage is unlocked and marked up to date here */
- fscache_end_io(op, netpage, 0);
- put_page(netpage);
- netpage = NULL;
- fscache_retrieval_complete(op, 1);
- continue;
- }
-
- netpage = NULL;
-
- _debug("out");
-
-out:
- /* tidy up */
- if (newpage)
- put_page(newpage);
- if (netpage)
- put_page(netpage);
- if (backpage)
- put_page(backpage);
- if (monitor) {
- fscache_put_retrieval(op);
- kfree(monitor);
- }
-
- list_for_each_entry_safe(netpage, _n, list, lru) {
- list_del(&netpage->lru);
- put_page(netpage);
- fscache_retrieval_complete(op, 1);
- }
-
- _leave(" = %d", ret);
- return ret;
-
-nomem:
- _debug("nomem");
- ret = -ENOMEM;
- goto record_page_complete;
-
-read_error:
- _debug("read error %d", ret);
- if (ret == -ENOMEM)
- goto record_page_complete;
-io_error:
- cachefiles_io_error_obj(object, "Page read error on backing file");
- ret = -ENOBUFS;
-record_page_complete:
- fscache_retrieval_complete(op, 1);
- goto out;
-}
-
-/*
- * read a list of pages from the cache or allocate blocks in which to store
- * them
- */
-int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
- struct list_head *pages,
- unsigned *nr_pages,
- gfp_t gfp)
-{
- struct cachefiles_object *object;
- struct cachefiles_cache *cache;
- struct list_head backpages;
- struct pagevec pagevec;
- struct inode *inode;
- struct page *page, *_n;
- unsigned shift, nrbackpages;
- int ret, ret2, space;
-
- object = container_of(op->op.object,
- struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
-
- _enter("{OBJ%x,%d},,%d,,",
- object->fscache.debug_id, atomic_read(&op->op.usage),
- *nr_pages);
-
- if (!object->backer)
- goto all_enobufs;
-
- space = 1;
- if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
- space = 0;
-
- inode = d_backing_inode(object->backer);
- ASSERT(S_ISREG(inode->i_mode));
-
- /* calculate the shift required to use bmap */
- shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
-
- pagevec_init(&pagevec);
-
- op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
- op->op.flags |= FSCACHE_OP_ASYNC;
- op->op.processor = cachefiles_read_copier;
-
- INIT_LIST_HEAD(&backpages);
- nrbackpages = 0;
-
- ret = space ? -ENODATA : -ENOBUFS;
- list_for_each_entry_safe(page, _n, pages, lru) {
- sector_t block;
-
- /* we assume the absence or presence of the first block is a
- * good enough indication for the page as a whole
- * - TODO: don't use bmap() for this as it is _not_ actually
- * good enough for this as it doesn't indicate errors, but
- * it's all we've got for the moment
- */
- block = page->index;
- block <<= shift;
-
- ret2 = bmap(inode, &block);
- ASSERT(ret2 == 0);
-
- _debug("%llx -> %llx",
- (unsigned long long) (page->index << shift),
- (unsigned long long) block);
-
- if (block) {
- /* we have data - add it to the list to give to the
- * backing fs */
- list_move(&page->lru, &backpages);
- (*nr_pages)--;
- nrbackpages++;
- } else if (space && pagevec_add(&pagevec, page) == 0) {
- fscache_mark_pages_cached(op, &pagevec);
- fscache_retrieval_complete(op, 1);
- ret = -ENODATA;
- } else {
- fscache_retrieval_complete(op, 1);
- }
- }
-
- if (pagevec_count(&pagevec) > 0)
- fscache_mark_pages_cached(op, &pagevec);
-
- if (list_empty(pages))
- ret = 0;
-
- /* submit the apparently valid pages to the backing fs to be read from
- * disk */
- if (nrbackpages > 0) {
- ret2 = cachefiles_read_backing_file(object, op, &backpages);
- if (ret2 == -ENOMEM || ret2 == -EINTR)
- ret = ret2;
- }
-
- _leave(" = %d [nr=%u%s]",
- ret, *nr_pages, list_empty(pages) ? " empty" : "");
- return ret;
-
-all_enobufs:
- fscache_retrieval_complete(op, *nr_pages);
- return -ENOBUFS;
-}
-
-/*
- * allocate a block in the cache in which to store a page
- * - cache withdrawal is prevented by the caller
- * - returns -EINTR if interrupted
- * - returns -ENOMEM if ran out of memory
- * - returns -ENOBUFS if no buffers can be made available
- * - returns -ENOBUFS if page is beyond EOF
- * - otherwise:
- * - the metadata will be retained
- * - 0 will be returned
- */
-int cachefiles_allocate_page(struct fscache_retrieval *op,
- struct page *page,
- gfp_t gfp)
-{
- struct cachefiles_object *object;
- struct cachefiles_cache *cache;
- int ret;
-
- object = container_of(op->op.object,
- struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
-
- _enter("%p,{%lx},", object, page->index);
-
- ret = cachefiles_has_space(cache, 0, 1);
- if (ret == 0)
- fscache_mark_page_cached(op, page);
- else
- ret = -ENOBUFS;
-
- fscache_retrieval_complete(op, 1);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * allocate blocks in the cache in which to store a set of pages
- * - cache withdrawal is prevented by the caller
- * - returns -EINTR if interrupted
- * - returns -ENOMEM if ran out of memory
- * - returns -ENOBUFS if some buffers couldn't be made available
- * - returns -ENOBUFS if some pages are beyond EOF
- * - otherwise:
- * - -ENODATA will be returned
- * - metadata will be retained for any page marked
- */
-int cachefiles_allocate_pages(struct fscache_retrieval *op,
- struct list_head *pages,
- unsigned *nr_pages,
- gfp_t gfp)
-{
- struct cachefiles_object *object;
- struct cachefiles_cache *cache;
- struct pagevec pagevec;
- struct page *page;
- int ret;
-
- object = container_of(op->op.object,
- struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
-
- _enter("%p,,,%d,", object, *nr_pages);
-
- ret = cachefiles_has_space(cache, 0, *nr_pages);
- if (ret == 0) {
- pagevec_init(&pagevec);
-
- list_for_each_entry(page, pages, lru) {
- if (pagevec_add(&pagevec, page) == 0)
- fscache_mark_pages_cached(op, &pagevec);
- }
-
- if (pagevec_count(&pagevec) > 0)
- fscache_mark_pages_cached(op, &pagevec);
- ret = -ENODATA;
- } else {
- ret = -ENOBUFS;
- }
-
- fscache_retrieval_complete(op, *nr_pages);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * request a page be stored in the cache
- * - cache withdrawal is prevented by the caller
- * - this request may be ignored if there's no cache block available, in which
- * case -ENOBUFS will be returned
- * - if the op is in progress, 0 will be returned
- */
-int cachefiles_write_page(struct fscache_storage *op, struct page *page)
-{
- struct cachefiles_object *object;
- struct cachefiles_cache *cache;
- struct file *file;
- struct path path;
- loff_t pos, eof;
- size_t len;
- void *data;
- int ret = -ENOBUFS;
-
- ASSERT(op != NULL);
- ASSERT(page != NULL);
-
- object = container_of(op->op.object,
- struct cachefiles_object, fscache);
-
- _enter("%p,%p{%lx},,,", object, page, page->index);
-
- if (!object->backer) {
- _leave(" = -ENOBUFS");
- return -ENOBUFS;
- }
-
- ASSERT(d_is_reg(object->backer));
-
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
-
- pos = (loff_t)page->index << PAGE_SHIFT;
-
- /* We mustn't write more data than we have, so we have to beware of a
- * partial page at EOF.
- */
- eof = object->fscache.store_limit_l;
- if (pos >= eof)
- goto error;
-
- /* write the page to the backing filesystem and let it store it in its
- * own time */
- path.mnt = cache->mnt;
- path.dentry = object->backer;
- file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto error_2;
- }
-
- len = PAGE_SIZE;
- if (eof & ~PAGE_MASK) {
- if (eof - pos < PAGE_SIZE) {
- _debug("cut short %llx to %llx",
- pos, eof);
- len = eof - pos;
- ASSERTCMP(pos + len, ==, eof);
- }
- }
-
- data = kmap(page);
- ret = kernel_write(file, data, len, &pos);
- kunmap(page);
- fput(file);
- if (ret != len)
- goto error_eio;
-
- _leave(" = 0");
- return 0;
-
-error_eio:
- ret = -EIO;
-error_2:
- if (ret == -EIO)
- cachefiles_io_error_obj(object,
- "Write page to backing file failed");
-error:
- _leave(" = -ENOBUFS [%d]", ret);
- return -ENOBUFS;
-}
-
-/*
- * detach a backing block from a page
- * - cache withdrawal is prevented by the caller
- */
-void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
- __releases(&object->fscache.cookie->lock)
-{
- struct cachefiles_object *object;
-
- object = container_of(_object, struct cachefiles_object, fscache);
-
- _enter("%p,{%lu}", object, page->index);
-
- spin_unlock(&object->fscache.cookie->lock);
-}
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index aec13fd94692..fe777164f1d8 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* CacheFiles security management
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c
new file mode 100644
index 000000000000..89df0ba8ba5e
--- /dev/null
+++ b/fs/cachefiles/volume.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Volume handling.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "internal.h"
+#include <trace/events/fscache.h>
+
+/*
+ * Allocate and set up a volume representation. We make sure all the fanout
+ * directories are created and pinned.
+ */
+void cachefiles_acquire_volume(struct fscache_volume *vcookie)
+{
+ struct cachefiles_volume *volume;
+ struct cachefiles_cache *cache = vcookie->cache->cache_priv;
+ const struct cred *saved_cred;
+ struct dentry *vdentry, *fan;
+ size_t len;
+ char *name;
+ bool is_new = false;
+ int ret, n_accesses, i;
+
+ _enter("");
+
+ volume = kzalloc(sizeof(struct cachefiles_volume), GFP_KERNEL);
+ if (!volume)
+ return;
+ volume->vcookie = vcookie;
+ volume->cache = cache;
+ INIT_LIST_HEAD(&volume->cache_link);
+
+ cachefiles_begin_secure(cache, &saved_cred);
+
+ len = vcookie->key[0];
+ name = kmalloc(len + 3, GFP_NOFS);
+ if (!name)
+ goto error_vol;
+ name[0] = 'I';
+ memcpy(name + 1, vcookie->key + 1, len);
+ name[len + 1] = 0;
+
+retry:
+ vdentry = cachefiles_get_directory(cache, cache->store, name, &is_new);
+ if (IS_ERR(vdentry))
+ goto error_name;
+ volume->dentry = vdentry;
+
+ if (is_new) {
+ if (!cachefiles_set_volume_xattr(volume))
+ goto error_dir;
+ } else {
+ ret = cachefiles_check_volume_xattr(volume);
+ if (ret < 0) {
+ if (ret != -ESTALE)
+ goto error_dir;
+ inode_lock_nested(d_inode(cache->store), I_MUTEX_PARENT);
+ cachefiles_bury_object(cache, NULL, cache->store, vdentry,
+ FSCACHE_VOLUME_IS_WEIRD);
+ cachefiles_put_directory(volume->dentry);
+ cond_resched();
+ goto retry;
+ }
+ }
+
+ for (i = 0; i < 256; i++) {
+ sprintf(name, "@%02x", i);
+ fan = cachefiles_get_directory(cache, vdentry, name, NULL);
+ if (IS_ERR(fan))
+ goto error_fan;
+ volume->fanout[i] = fan;
+ }
+
+ cachefiles_end_secure(cache, saved_cred);
+
+ vcookie->cache_priv = volume;
+ n_accesses = atomic_inc_return(&vcookie->n_accesses); /* Stop wakeups on dec-to-0 */
+ trace_fscache_access_volume(vcookie->debug_id, 0,
+ refcount_read(&vcookie->ref),
+ n_accesses, fscache_access_cache_pin);
+
+ spin_lock(&cache->object_list_lock);
+ list_add(&volume->cache_link, &volume->cache->volumes);
+ spin_unlock(&cache->object_list_lock);
+
+ kfree(name);
+ return;
+
+error_fan:
+ for (i = 0; i < 256; i++)
+ cachefiles_put_directory(volume->fanout[i]);
+error_dir:
+ cachefiles_put_directory(volume->dentry);
+error_name:
+ kfree(name);
+error_vol:
+ kfree(volume);
+ cachefiles_end_secure(cache, saved_cred);
+}
+
+/*
+ * Release a volume representation.
+ */
+static void __cachefiles_free_volume(struct cachefiles_volume *volume)
+{
+ int i;
+
+ _enter("");
+
+ volume->vcookie->cache_priv = NULL;
+
+ for (i = 0; i < 256; i++)
+ cachefiles_put_directory(volume->fanout[i]);
+ cachefiles_put_directory(volume->dentry);
+ kfree(volume);
+}
+
+void cachefiles_free_volume(struct fscache_volume *vcookie)
+{
+ struct cachefiles_volume *volume = vcookie->cache_priv;
+
+ if (volume) {
+ spin_lock(&volume->cache->object_list_lock);
+ list_del_init(&volume->cache_link);
+ spin_unlock(&volume->cache->object_list_lock);
+ __cachefiles_free_volume(volume);
+ }
+}
+
+void cachefiles_withdraw_volume(struct cachefiles_volume *volume)
+{
+ fscache_withdraw_volume(volume->vcookie);
+ cachefiles_set_volume_xattr(volume);
+ __cachefiles_free_volume(volume);
+}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 9e82de668595..00b087c14995 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* CacheFiles extended attribute management
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
@@ -15,310 +15,262 @@
#include <linux/slab.h>
#include "internal.h"
+#define CACHEFILES_COOKIE_TYPE_DATA 1
+
+struct cachefiles_xattr {
+ __be64 object_size; /* Actual size of the object */
+ __be64 zero_point; /* Size after which server has no data not written by us */
+ __u8 type; /* Type of object */
+ __u8 content; /* Content presence (enum cachefiles_content) */
+ __u8 data[]; /* netfs coherency data */
+} __packed;
+
static const char cachefiles_xattr_cache[] =
XATTR_USER_PREFIX "CacheFiles.cache";
+struct cachefiles_vol_xattr {
+ __be32 reserved; /* Reserved, should be 0 */
+ __u8 data[]; /* netfs volume coherency data */
+} __packed;
+
/*
- * check the type label on an object
- * - done using xattrs
+ * set the state xattr on a cache file
*/
-int cachefiles_check_object_type(struct cachefiles_object *object)
+int cachefiles_set_object_xattr(struct cachefiles_object *object)
{
- struct dentry *dentry = object->dentry;
- char type[3], xtype[3];
+ struct cachefiles_xattr *buf;
+ struct dentry *dentry;
+ struct file *file = object->file;
+ unsigned int len = object->cookie->aux_len;
int ret;
- ASSERT(dentry);
- ASSERT(d_backing_inode(dentry));
-
- if (!object->fscache.cookie)
- strcpy(type, "C3");
- else
- snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
-
- _enter("%x{%s}", object->fscache.debug_id, type);
+ if (!file)
+ return -ESTALE;
+ dentry = file->f_path.dentry;
- /* attempt to install a type label directly */
- ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type,
- 2, XATTR_CREATE);
- if (ret == 0) {
- _debug("SET"); /* we succeeded */
- goto error;
- }
+ _enter("%x,#%d", object->debug_id, len);
- if (ret != -EEXIST) {
- pr_err("Can't set xattr on %pd [%lu] (err %d)\n",
- dentry, d_backing_inode(dentry)->i_ino,
- -ret);
- goto error;
- }
+ buf = kmalloc(sizeof(struct cachefiles_xattr) + len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- /* read the current type label */
- ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, xtype,
- 3);
+ buf->object_size = cpu_to_be64(object->cookie->object_size);
+ buf->zero_point = 0;
+ buf->type = CACHEFILES_COOKIE_TYPE_DATA;
+ buf->content = object->content_info;
+ if (test_bit(FSCACHE_COOKIE_LOCAL_WRITE, &object->cookie->flags))
+ buf->content = CACHEFILES_CONTENT_DIRTY;
+ if (len > 0)
+ memcpy(buf->data, fscache_get_aux(object->cookie), len);
+
+ ret = cachefiles_inject_write_error();
+ if (ret == 0)
+ ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
+ buf, sizeof(struct cachefiles_xattr) + len, 0);
if (ret < 0) {
- if (ret == -ERANGE)
- goto bad_type_length;
-
- pr_err("Can't read xattr on %pd [%lu] (err %d)\n",
- dentry, d_backing_inode(dentry)->i_ino,
- -ret);
- goto error;
+ trace_cachefiles_vfs_error(object, file_inode(file), ret,
+ cachefiles_trace_setxattr_error);
+ trace_cachefiles_coherency(object, file_inode(file)->i_ino,
+ buf->content,
+ cachefiles_coherency_set_fail);
+ if (ret != -ENOMEM)
+ cachefiles_io_error_obj(
+ object,
+ "Failed to set xattr with error %d", ret);
+ } else {
+ trace_cachefiles_coherency(object, file_inode(file)->i_ino,
+ buf->content,
+ cachefiles_coherency_set_ok);
}
- /* check the type is what we're expecting */
- if (ret != 2)
- goto bad_type_length;
-
- if (xtype[0] != type[0] || xtype[1] != type[1])
- goto bad_type;
-
- ret = 0;
-
-error:
+ kfree(buf);
_leave(" = %d", ret);
return ret;
-
-bad_type_length:
- pr_err("Cache object %lu type xattr length incorrect\n",
- d_backing_inode(dentry)->i_ino);
- ret = -EIO;
- goto error;
-
-bad_type:
- xtype[2] = 0;
- pr_err("Cache object %pd [%lu] type %s not %s\n",
- dentry, d_backing_inode(dentry)->i_ino,
- xtype, type);
- ret = -EIO;
- goto error;
}
/*
- * set the state xattr on a cache file
+ * check the consistency between the backing cache and the FS-Cache cookie
*/
-int cachefiles_set_object_xattr(struct cachefiles_object *object,
- struct cachefiles_xattr *auxdata)
+int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file)
{
- struct dentry *dentry = object->dentry;
- int ret;
-
- ASSERT(dentry);
-
- _enter("%p,#%d", object, auxdata->len);
+ struct cachefiles_xattr *buf;
+ struct dentry *dentry = file->f_path.dentry;
+ unsigned int len = object->cookie->aux_len, tlen;
+ const void *p = fscache_get_aux(object->cookie);
+ enum cachefiles_coherency_trace why;
+ ssize_t xlen;
+ int ret = -ESTALE;
- /* attempt to install the cache metadata directly */
- _debug("SET #%u", auxdata->len);
+ tlen = sizeof(struct cachefiles_xattr) + len;
+ buf = kmalloc(tlen, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
- ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
- &auxdata->type, auxdata->len, XATTR_CREATE);
- if (ret < 0 && ret != -ENOMEM)
- cachefiles_io_error_obj(
- object,
- "Failed to set xattr with error %d", ret);
+ xlen = cachefiles_inject_read_error();
+ if (xlen == 0)
+ xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, tlen);
+ if (xlen != tlen) {
+ if (xlen < 0)
+ trace_cachefiles_vfs_error(object, file_inode(file), xlen,
+ cachefiles_trace_getxattr_error);
+ if (xlen == -EIO)
+ cachefiles_io_error_obj(
+ object,
+ "Failed to read aux with error %zd", xlen);
+ why = cachefiles_coherency_check_xattr;
+ } else if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) {
+ why = cachefiles_coherency_check_type;
+ } else if (memcmp(buf->data, p, len) != 0) {
+ why = cachefiles_coherency_check_aux;
+ } else if (be64_to_cpu(buf->object_size) != object->cookie->object_size) {
+ why = cachefiles_coherency_check_objsize;
+ } else if (buf->content == CACHEFILES_CONTENT_DIRTY) {
+ // TODO: Begin conflict resolution
+ pr_warn("Dirty object in cache\n");
+ why = cachefiles_coherency_check_dirty;
+ } else {
+ why = cachefiles_coherency_check_ok;
+ ret = 0;
+ }
- _leave(" = %d", ret);
+ trace_cachefiles_coherency(object, file_inode(file)->i_ino,
+ buf->content, why);
+ kfree(buf);
return ret;
}
/*
- * update the state xattr on a cache file
+ * remove the object's xattr to mark it stale
*/
-int cachefiles_update_object_xattr(struct cachefiles_object *object,
- struct cachefiles_xattr *auxdata)
+int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+ struct cachefiles_object *object,
+ struct dentry *dentry)
{
- struct dentry *dentry = object->dentry;
int ret;
- if (!dentry)
- return -ESTALE;
-
- _enter("%x,#%d", object->fscache.debug_id, auxdata->len);
-
- /* attempt to install the cache metadata directly */
- _debug("SET #%u", auxdata->len);
-
- clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
- ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
- &auxdata->type, auxdata->len, XATTR_REPLACE);
- if (ret < 0 && ret != -ENOMEM)
- cachefiles_io_error_obj(
- object,
- "Failed to update xattr with error %d", ret);
+ ret = cachefiles_inject_remove_error();
+ if (ret == 0)
+ ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache);
+ if (ret < 0) {
+ trace_cachefiles_vfs_error(object, d_inode(dentry), ret,
+ cachefiles_trace_remxattr_error);
+ if (ret == -ENOENT || ret == -ENODATA)
+ ret = 0;
+ else if (ret != -ENOMEM)
+ cachefiles_io_error(cache,
+ "Can't remove xattr from %lu"
+ " (error %d)",
+ d_backing_inode(dentry)->i_ino, -ret);
+ }
_leave(" = %d", ret);
return ret;
}
/*
- * check the consistency between the backing cache and the FS-Cache cookie
+ * Stick a marker on the cache object to indicate that it's dirty.
*/
-int cachefiles_check_auxdata(struct cachefiles_object *object)
+void cachefiles_prepare_to_write(struct fscache_cookie *cookie)
{
- struct cachefiles_xattr *auxbuf;
- enum fscache_checkaux validity;
- struct dentry *dentry = object->dentry;
- ssize_t xlen;
- int ret;
-
- ASSERT(dentry);
- ASSERT(d_backing_inode(dentry));
- ASSERT(object->fscache.cookie->def->check_aux);
+ const struct cred *saved_cred;
+ struct cachefiles_object *object = cookie->cache_priv;
+ struct cachefiles_cache *cache = object->volume->cache;
- auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
- if (!auxbuf)
- return -ENOMEM;
+ _enter("c=%08x", object->cookie->debug_id);
- xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
- &auxbuf->type, 512 + 1);
- ret = -ESTALE;
- if (xlen < 1 ||
- auxbuf->type != object->fscache.cookie->def->type)
- goto error;
-
- xlen--;
- validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen,
- i_size_read(d_backing_inode(dentry)));
- if (validity != FSCACHE_CHECKAUX_OKAY)
- goto error;
-
- ret = 0;
-error:
- kfree(auxbuf);
- return ret;
+ if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
+ cachefiles_begin_secure(cache, &saved_cred);
+ cachefiles_set_object_xattr(object);
+ cachefiles_end_secure(cache, saved_cred);
+ }
}
/*
- * check the state xattr on a cache file
- * - return -ESTALE if the object should be deleted
+ * Set the state xattr on a volume directory.
*/
-int cachefiles_check_object_xattr(struct cachefiles_object *object,
- struct cachefiles_xattr *auxdata)
+bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
{
- struct cachefiles_xattr *auxbuf;
- struct dentry *dentry = object->dentry;
+ struct cachefiles_vol_xattr *buf;
+ unsigned int len = volume->vcookie->coherency_len;
+ const void *p = volume->vcookie->coherency;
+ struct dentry *dentry = volume->dentry;
int ret;
- _enter("%p,#%d", object, auxdata->len);
+ _enter("%x,#%d", volume->vcookie->debug_id, len);
- ASSERT(dentry);
- ASSERT(d_backing_inode(dentry));
-
- auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
- if (!auxbuf) {
- _leave(" = -ENOMEM");
- return -ENOMEM;
- }
+ len += sizeof(*buf);
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return false;
+ buf->reserved = cpu_to_be32(0);
+ memcpy(buf->data, p, volume->vcookie->coherency_len);
- /* read the current type label */
- ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
- &auxbuf->type, 512 + 1);
+ ret = cachefiles_inject_write_error();
+ if (ret == 0)
+ ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
+ buf, len, 0);
if (ret < 0) {
- if (ret == -ENODATA)
- goto stale; /* no attribute - power went off
- * mid-cull? */
-
- if (ret == -ERANGE)
- goto bad_type_length;
-
- cachefiles_io_error_obj(object,
- "Can't read xattr on %lu (err %d)",
- d_backing_inode(dentry)->i_ino, -ret);
- goto error;
- }
-
- /* check the on-disk object */
- if (ret < 1)
- goto bad_type_length;
-
- if (auxbuf->type != auxdata->type)
- goto stale;
-
- auxbuf->len = ret;
-
- /* consult the netfs */
- if (object->fscache.cookie->def->check_aux) {
- enum fscache_checkaux result;
- unsigned int dlen;
-
- dlen = auxbuf->len - 1;
-
- _debug("checkaux %s #%u",
- object->fscache.cookie->def->name, dlen);
-
- result = fscache_check_aux(&object->fscache,
- &auxbuf->data, dlen,
- i_size_read(d_backing_inode(dentry)));
-
- switch (result) {
- /* entry okay as is */
- case FSCACHE_CHECKAUX_OKAY:
- goto okay;
-
- /* entry requires update */
- case FSCACHE_CHECKAUX_NEEDS_UPDATE:
- break;
-
- /* entry requires deletion */
- case FSCACHE_CHECKAUX_OBSOLETE:
- goto stale;
-
- default:
- BUG();
- }
-
- /* update the current label */
- ret = vfs_setxattr(&init_user_ns, dentry,
- cachefiles_xattr_cache, &auxdata->type,
- auxdata->len, XATTR_REPLACE);
- if (ret < 0) {
- cachefiles_io_error_obj(object,
- "Can't update xattr on %lu"
- " (error %d)",
- d_backing_inode(dentry)->i_ino, -ret);
- goto error;
- }
+ trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret,
+ cachefiles_trace_setxattr_error);
+ trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino,
+ cachefiles_coherency_vol_set_fail);
+ if (ret != -ENOMEM)
+ cachefiles_io_error(
+ volume->cache, "Failed to set xattr with error %d", ret);
+ } else {
+ trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino,
+ cachefiles_coherency_vol_set_ok);
}
-okay:
- ret = 0;
-
-error:
- kfree(auxbuf);
+ kfree(buf);
_leave(" = %d", ret);
- return ret;
-
-bad_type_length:
- pr_err("Cache object %lu xattr length incorrect\n",
- d_backing_inode(dentry)->i_ino);
- ret = -EIO;
- goto error;
-
-stale:
- ret = -ESTALE;
- goto error;
+ return ret == 0;
}
/*
- * remove the object's xattr to mark it stale
+ * Check the consistency between the backing cache and the volume cookie.
*/
-int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
- struct dentry *dentry)
+int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
{
- int ret;
+ struct cachefiles_vol_xattr *buf;
+ struct dentry *dentry = volume->dentry;
+ unsigned int len = volume->vcookie->coherency_len;
+ const void *p = volume->vcookie->coherency;
+ enum cachefiles_coherency_trace why;
+ ssize_t xlen;
+ int ret = -ESTALE;
- ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache);
- if (ret < 0) {
- if (ret == -ENOENT || ret == -ENODATA)
- ret = 0;
- else if (ret != -ENOMEM)
- cachefiles_io_error(cache,
- "Can't remove xattr from %lu"
- " (error %d)",
- d_backing_inode(dentry)->i_ino, -ret);
+ _enter("");
+
+ len += sizeof(*buf);
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ xlen = cachefiles_inject_read_error();
+ if (xlen == 0)
+ xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, len);
+ if (xlen != len) {
+ if (xlen < 0) {
+ trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen,
+ cachefiles_trace_getxattr_error);
+ if (xlen == -EIO)
+ cachefiles_io_error(
+ volume->cache,
+ "Failed to read xattr with error %zd", xlen);
+ }
+ why = cachefiles_coherency_vol_check_xattr;
+ } else if (buf->reserved != cpu_to_be32(0)) {
+ why = cachefiles_coherency_vol_check_resv;
+ } else if (memcmp(buf->data, p, len - sizeof(*buf)) != 0) {
+ why = cachefiles_coherency_vol_check_cmp;
+ } else {
+ why = cachefiles_coherency_vol_check_ok;
+ ret = 0;
}
+ trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, why);
+ kfree(buf);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 99b80b5c7a93..b6edcf89a429 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -4,8 +4,8 @@
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/pagemap.h>
-#include <linux/writeback.h> /* generic_writepages */
#include <linux/slab.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
@@ -63,7 +63,7 @@
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct page *page, void **_fsdata);
+ struct folio *folio, void **_fsdata);
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
@@ -76,18 +76,17 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
* Dirty a page. Optimistically adjust accounting, on the assumption
* that we won't race with invalidate. If we do, readjust.
*/
-static int ceph_set_page_dirty(struct page *page)
+static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct address_space *mapping = page->mapping;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- if (PageDirty(page)) {
- dout("%p set_page_dirty %p idx %lu -- already dirty\n",
- mapping->host, page, page->index);
- BUG_ON(!PagePrivate(page));
- return 0;
+ if (folio_test_dirty(folio)) {
+ dout("%p dirty_folio %p idx %lu -- already dirty\n",
+ mapping->host, folio, folio->index);
+ VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
+ return false;
}
inode = mapping->host;
@@ -111,75 +110,81 @@ static int ceph_set_page_dirty(struct page *page)
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
- dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d "
"snapc %p seq %lld (%d snaps)\n",
- mapping->host, page, page->index,
+ mapping->host, folio, folio->index,
ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
/*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
+ * Reference snap context in folio->private. Also set
+ * PagePrivate so that we get invalidate_folio callback.
*/
- BUG_ON(PagePrivate(page));
- attach_page_private(page, snapc);
+ VM_BUG_ON_FOLIO(folio_test_private(folio), folio);
+ folio_attach_private(folio, snapc);
- return __set_page_dirty_nobuffers(page);
+ return ceph_fscache_dirty_folio(mapping, folio);
}
/*
- * If we are truncating the full page (i.e. offset == 0), adjust the
- * dirty page counters appropriately. Only called if there is private
- * data on the page.
+ * If we are truncating the full folio (i.e. offset == 0), adjust the
+ * dirty folio counters appropriately. Only called if there is private
+ * data on the folio.
*/
-static void ceph_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ceph_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- wait_on_page_fscache(page);
-
- inode = page->mapping->host;
+ inode = folio->mapping->host;
ci = ceph_inode(inode);
- if (offset != 0 || length != thp_size(page)) {
- dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
- inode, page, page->index, offset, length);
+ if (offset != 0 || length != folio_size(folio)) {
+ dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n",
+ inode, folio->index, offset, length);
return;
}
- WARN_ON(!PageLocked(page));
- if (!PagePrivate(page))
- return;
+ WARN_ON(!folio_test_locked(folio));
+ if (folio_test_private(folio)) {
+ dout("%p invalidate_folio idx %lu full dirty page\n",
+ inode, folio->index);
- dout("%p invalidatepage %p idx %lu full dirty page\n",
- inode, page, page->index);
+ snapc = folio_detach_private(folio);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ }
- snapc = detach_page_private(page);
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
- ceph_put_snap_context(snapc);
+ folio_wait_fscache(folio);
}
static int ceph_releasepage(struct page *page, gfp_t gfp)
{
- dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
- page, page->index, PageDirty(page) ? "" : "not ");
+ struct inode *inode = page->mapping->host;
+
+ dout("%llx:%llx releasepage %p idx %lu (%sdirty)\n",
+ ceph_vinop(inode), page,
+ page->index, PageDirty(page) ? "" : "not ");
+
+ if (PagePrivate(page))
+ return 0;
if (PageFsCache(page)) {
- if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS))
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
return 0;
wait_on_page_fscache(page);
}
- return !PagePrivate(page);
+ ceph_fscache_note_page_release(inode);
+ return 1;
}
-static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
+static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
{
- struct inode *inode = rreq->mapping->host;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_layout *lo = &ci->i_layout;
u32 blockoff;
@@ -194,9 +199,9 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
rreq->len = roundup(rreq->len, lo->stripe_unit);
}
-static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
+static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
- struct inode *inode = subreq->rreq->mapping->host;
+ struct inode *inode = subreq->rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
@@ -213,7 +218,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
- struct netfs_read_subrequest *subreq = req->r_priv;
+ struct netfs_io_subrequest *subreq = req->r_priv;
int num_pages;
int err = req->r_result;
@@ -239,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req)
iput(req->r_inode);
}
-static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
+static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ struct ceph_mds_reply_info_in *iinfo;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct iov_iter iter;
+ ssize_t err = 0;
+ size_t len;
+
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+
+ if (subreq->start >= inode->i_size)
+ goto out;
+
+ /* We need to fetch the inline data. */
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_ino1 = ci->i_vino;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
+ req->r_num_caps = 2;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto out;
+
+ rinfo = &req->r_reply_info;
+ iinfo = &rinfo->targeti;
+ if (iinfo->inline_version == CEPH_INLINE_NONE) {
+ /* The data got uninlined */
+ ceph_mdsc_put_request(req);
+ return false;
+ }
+
+ len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
+ if (err == 0)
+ err = -EFAULT;
+
+ ceph_mdsc_put_request(req);
+out:
+ netfs_subreq_terminated(subreq, err, false);
+ return true;
+}
+
+static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
- struct netfs_read_request *rreq = subreq->rreq;
- struct inode *inode = rreq->mapping->host;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
@@ -253,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
int err = 0;
u64 len = subreq->len;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ ceph_netfs_issue_op_inline(subreq))
+ return;
+
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
0, 1, CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
@@ -291,8 +353,43 @@ out:
dout("%s: result %d\n", __func__, err);
}
-static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file)
+static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
+ struct inode *inode = rreq->inode;
+ int got = 0, want = CEPH_CAP_FILE_CACHE;
+ int ret = 0;
+
+ if (rreq->origin != NETFS_READAHEAD)
+ return 0;
+
+ if (file) {
+ struct ceph_rw_context *rw_ctx;
+ struct ceph_file_info *fi = file->private_data;
+
+ rw_ctx = ceph_find_rw_context(fi);
+ if (rw_ctx)
+ return 0;
+ }
+
+ /*
+ * readahead callers do not necessarily hold Fcb caps
+ * (e.g. fadvise, madvise).
+ */
+ ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
+ if (ret < 0) {
+ dout("start_read %p, error getting cap\n", inode);
+ return ret;
+ }
+
+ if (!(got & want)) {
+ dout("start_read %p, no cache cap\n", inode);
+ return -EACCES;
+ }
+ if (ret == 0)
+ return -EACCES;
+
+ rreq->netfs_priv = (void *)(uintptr_t)got;
+ return 0;
}
static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
@@ -305,77 +402,47 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
ceph_put_cap_refs(ci, got);
}
-static const struct netfs_read_request_ops ceph_netfs_read_ops = {
- .init_rreq = ceph_init_rreq,
- .is_cache_enabled = ceph_is_cache_enabled,
+const struct netfs_request_ops ceph_netfs_ops = {
+ .init_request = ceph_init_request,
.begin_cache_operation = ceph_begin_cache_operation,
- .issue_op = ceph_netfs_issue_op,
+ .issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
.cleanup = ceph_readahead_cleanup,
};
-/* read a single page, without unlocking it. */
-static int ceph_readpage(struct file *file, struct page *page)
+#ifdef CONFIG_CEPH_FSCACHE
+static void ceph_set_page_fscache(struct page *page)
{
- struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vino vino = ceph_vino(inode);
- u64 off = page_offset(page);
- u64 len = thp_size(page);
-
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- /*
- * Uptodate inline data should have been added
- * into page cache while getting Fcr caps.
- */
- if (off == 0) {
- unlock_page(page);
- return -EINVAL;
- }
- zero_user_segment(page, 0, thp_size(page));
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
- }
-
- dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
- vino.ino, vino.snap, file, off, len, page, page->index);
-
- return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL);
+ set_page_fscache(page);
}
-static void ceph_readahead(struct readahead_control *ractl)
+static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
{
- struct inode *inode = file_inode(ractl->file);
- struct ceph_file_info *fi = ractl->file->private_data;
- struct ceph_rw_context *rw_ctx;
- int got = 0;
- int ret = 0;
+ struct inode *inode = priv;
- if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
- return;
+ if (IS_ERR_VALUE(error) && error != -ENOBUFS)
+ ceph_fscache_invalidate(inode, false);
+}
- rw_ctx = ceph_find_rw_context(fi);
- if (!rw_ctx) {
- /*
- * readahead callers do not necessarily hold Fcb caps
- * (e.g. fadvise, madvise).
- */
- int want = CEPH_CAP_FILE_CACHE;
+static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
- ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
- if (ret < 0)
- dout("start_read %p, error getting cap\n", inode);
- else if (!(got & want))
- dout("start_read %p, no cache cap\n", inode);
+ fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
+ ceph_fscache_write_terminated, inode, caching);
+}
+#else
+static inline void ceph_set_page_fscache(struct page *page)
+{
+}
- if (ret <= 0)
- return;
- }
- netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got);
+static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
}
+#endif /* CONFIG_CEPH_FSCACHE */
struct ceph_writeback_ctl
{
@@ -482,6 +549,7 @@ static u64 get_writepages_data_length(struct inode *inode,
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -492,6 +560,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
+ bool caching = ceph_is_cache_enabled(inode);
dout("writepage %p idx %lu\n", page, page->index);
@@ -515,8 +584,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n", page, ceph_wbc.i_size);
- page->mapping->a_ops->invalidatepage(page, 0, thp_size(page));
+ dout("folio at %lu beyond eof %llu\n", folio->index,
+ ceph_wbc.i_size);
+ folio_invalidate(folio, 0, folio_size(folio));
return 0;
}
@@ -528,18 +598,19 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = true;
- set_page_writeback(page);
req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
true);
- if (IS_ERR(req)) {
- redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
+ if (IS_ERR(req))
return PTR_ERR(req);
- }
+
+ set_page_writeback(page);
+ if (caching)
+ ceph_set_page_fscache(page);
+ ceph_fscache_write_to_cache(inode, page_off, len, caching);
/* it may be a short write due to an object boundary */
WARN_ON_ONCE(len > thp_size(page));
@@ -587,7 +658,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = false;
return err;
}
@@ -598,6 +669,13 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = page->mapping->host;
BUG_ON(!inode);
ihold(inode);
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ ceph_inode_to_client(inode)->write_congested)
+ return AOP_WRITEPAGE_ACTIVATE;
+
+ wait_on_page_fscache(page);
+
err = writepage_nounlock(page, wbc);
if (err == -ERESTARTSYS) {
/* direct memory reclaimer was killed by SIGKILL. return 0
@@ -651,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req)
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
- if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
+ pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
+ __func__, req->r_ops[i].op, req, i, req->r_tid);
break;
+ }
osd_data = osd_req_op_extent_osd_data(req, i);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
@@ -668,8 +749,7 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
+ fsc->write_congested = false;
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
@@ -719,12 +799,17 @@ static int ceph_writepages_start(struct address_space *mapping,
struct ceph_writeback_ctl ceph_wbc;
bool should_loop, range_whole = false;
bool done = false;
+ bool caching = ceph_is_cache_enabled(inode);
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fsc->write_congested)
+ return 0;
dout("writepages_start %p (mode=%s)\n", inode,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
+ if (ceph_inode_is_shutdown(inode)) {
if (ci->i_wrbuffer_ref > 0) {
pr_warn_ratelimited(
"writepage_start %p %lld forced umount\n",
@@ -827,14 +912,16 @@ get_more_pages:
continue;
}
if (page_offset(page) >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n",
- page, ceph_wbc.i_size);
+ struct folio *folio = page_folio(page);
+
+ dout("folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
- page_offset(page) >= i_size_read(inode)) &&
- clear_page_dirty_for_io(page))
- mapping->a_ops->invalidatepage(page,
- 0, thp_size(page));
- unlock_page(page);
+ folio_pos(folio) >= i_size_read(inode)) &&
+ folio_clear_dirty_for_io(folio))
+ folio_invalidate(folio, 0,
+ folio_size(folio));
+ folio_unlock(folio);
continue;
}
if (strip_unit_end && (page->index > strip_unit_end)) {
@@ -842,7 +929,7 @@ get_more_pages:
unlock_page(page);
break;
}
- if (PageWriteback(page)) {
+ if (PageWriteback(page) || PageFsCache(page)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
dout("%p under writeback\n", page);
unlock_page(page);
@@ -850,6 +937,7 @@ get_more_pages:
}
dout("waiting on writeback %p\n", page);
wait_on_page_writeback(page);
+ wait_on_page_fscache(page);
}
if (!clear_page_dirty_for_io(page)) {
@@ -913,11 +1001,8 @@ get_more_pages:
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb)) {
- set_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
- }
-
+ fsc->mount_options->congestion_kb))
+ fsc->write_congested = true;
pages[locked_pages++] = page;
pvec.pages[i] = NULL;
@@ -982,9 +1067,19 @@ new_request:
op_idx = 0;
for (i = 0; i < locked_pages; i++) {
u64 cur_offset = page_offset(pages[i]);
+ /*
+ * Discontinuity in page range? Ceph can handle that by just passing
+ * multiple extents in the write op.
+ */
if (offset + len != cur_offset) {
+ /* If it's full, stop here */
if (op_idx + 1 == req->r_num_ops)
break;
+
+ /* Kick off an fscache write with what we have so far. */
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ /* Start a new extent */
osd_req_op_extent_dup_last(req, op_idx,
cur_offset - offset);
dout("writepages got pages at %llu~%llu\n",
@@ -995,14 +1090,17 @@ new_request:
osd_req_op_extent_update(req, op_idx, len);
len = 0;
- offset = cur_offset;
+ offset = cur_offset;
data_pages = pages + i;
op_idx++;
}
set_page_writeback(pages[i]);
+ if (caching)
+ ceph_set_page_fscache(pages[i]);
len += thp_size(page);
}
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
if (ceph_wbc.size_stable) {
len = min(len, ceph_wbc.i_size - offset);
@@ -1145,12 +1243,12 @@ static struct ceph_snap_context *
ceph_find_incompatible(struct page *page)
{
struct inode *inode = page->mapping->host;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
- dout(" page %p forced umount\n", page);
- return ERR_PTR(-EIO);
+ if (ceph_inode_is_shutdown(inode)) {
+ dout(" page %p %llx:%llx is shutdown\n", page,
+ ceph_vinop(inode));
+ return ERR_PTR(-ESTALE);
}
for (;;) {
@@ -1187,18 +1285,18 @@ ceph_find_incompatible(struct page *page)
}
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct page *page, void **_fsdata)
+ struct folio *folio, void **_fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- snapc = ceph_find_incompatible(page);
+ snapc = ceph_find_incompatible(folio_page(folio, 0));
if (snapc) {
int r;
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (IS_ERR(snapc))
return PTR_ERR(snapc);
@@ -1216,53 +1314,22 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
* clean, or already dirty within the same snap context.
*/
static int ceph_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len, unsigned aop_flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *page = NULL;
- pgoff_t index = pos >> PAGE_SHIFT;
+ struct folio *folio = NULL;
int r;
- /*
- * Uninlining should have already been done and everything updated, EXCEPT
- * for inline_version sent to the MDS.
- */
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
-
- /*
- * The inline_version on a new inode is set to 1. If that's the
- * case, then the page is brand new and isn't yet Uptodate.
- */
- r = 0;
- if (index == 0 && ci->i_inline_version != 1) {
- if (!PageUptodate(page)) {
- WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
- ci->i_inline_version);
- r = -EINVAL;
- }
- goto out;
- }
- zero_user_segment(page, 0, thp_size(page));
- SetPageUptodate(page);
- goto out;
- }
-
- r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL,
- &ceph_netfs_read_ops, NULL);
-out:
+ r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL);
if (r == 0)
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
if (r < 0) {
- if (page)
- put_page(page);
+ if (folio)
+ folio_put(folio);
} else {
- WARN_ON_ONCE(!PageLocked(page));
- *pagep = page;
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ *pagep = &folio->page;
}
return r;
}
@@ -1273,32 +1340,33 @@ out:
*/
static int ceph_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct page *subpage, void *fsdata)
{
+ struct folio *folio = page_folio(subpage);
struct inode *inode = file_inode(file);
bool check_cap = false;
- dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
- inode, page, (int)pos, (int)copied, (int)len);
+ dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file,
+ inode, folio, (int)pos, (int)copied, (int)len);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
/* just return that nothing was copied on a short copy */
if (copied < len) {
copied = 0;
goto out;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/* did file size increase? */
if (pos+copied > i_size_read(inode))
check_cap = ceph_inode_set_size(inode, pos+copied);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (check_cap)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
@@ -1306,28 +1374,17 @@ out:
return copied;
}
-/*
- * we set .direct_IO to indicate direct io is supported, but since we
- * intercept O_DIRECT reads and writes early, this function should
- * never get called.
- */
-static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
-{
- WARN_ON(1);
- return -EINVAL;
-}
-
const struct address_space_operations ceph_aops = {
- .readpage = ceph_readpage,
- .readahead = ceph_readahead,
+ .readpage = netfs_readpage,
+ .readahead = netfs_readahead,
.writepage = ceph_writepage,
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
.write_end = ceph_write_end,
- .set_page_dirty = ceph_set_page_dirty,
- .invalidatepage = ceph_invalidatepage,
+ .dirty_folio = ceph_dirty_folio,
+ .invalidate_folio = ceph_invalidate_folio,
.releasepage = ceph_releasepage,
- .direct_IO = ceph_direct_io,
+ .direct_IO = noop_direct_IO,
};
static void ceph_block_sigs(sigset_t *oldset)
@@ -1356,6 +1413,9 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
sigset_t oldset;
vm_fault_t ret = VM_FAULT_SIGBUS;
+ if (ceph_inode_is_shutdown(inode))
+ return ret;
+
ceph_block_sigs(&oldset);
dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
@@ -1447,6 +1507,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sigset_t oldset;
vm_fault_t ret = VM_FAULT_SIGBUS;
+ if (ceph_inode_is_shutdown(inode))
+ return ret;
+
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return VM_FAULT_OOM;
@@ -1454,19 +1517,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- struct page *locked_page = NULL;
- if (off == 0) {
- lock_page(page);
- locked_page = page;
- }
- err = ceph_uninline_data(vma->vm_file, locked_page);
- if (locked_page)
- unlock_page(locked_page);
- if (err < 0)
- goto out_free;
- }
-
if (off + thp_size(page) <= size)
len = thp_size(page);
else
@@ -1523,11 +1573,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
ceph_put_snap_context(snapc);
} while (err == 0);
- if (ret == VM_FAULT_LOCKED ||
- ci->i_inline_version != CEPH_INLINE_NONE) {
+ if (ret == VM_FAULT_LOCKED) {
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1591,16 +1639,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
-int ceph_uninline_data(struct file *filp, struct page *locked_page)
+int ceph_uninline_data(struct file *file)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
- struct page *page = NULL;
- u64 len, inline_version;
+ struct ceph_cap_flush *prealloc_cf;
+ struct folio *folio = NULL;
+ u64 inline_version = CEPH_INLINE_NONE;
+ struct page *pages[1];
int err = 0;
- bool from_pagecache = false;
+ u64 len;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ folio = read_mapping_folio(inode->i_mapping, 0, file);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
+ goto out;
+ }
+
+ folio_lock(folio);
spin_lock(&ci->i_ceph_lock);
inline_version = ci->i_inline_version;
@@ -1611,45 +1673,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
if (inline_version == 1 || /* initial version, no data */
inline_version == CEPH_INLINE_NONE)
- goto out;
-
- if (locked_page) {
- page = locked_page;
- WARN_ON(!PageUptodate(page));
- } else if (ceph_caps_issued(ci) &
- (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
- page = find_get_page(inode->i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- from_pagecache = true;
- lock_page(page);
- } else {
- put_page(page);
- page = NULL;
- }
- }
- }
+ goto out_unlock;
- if (page) {
- len = i_size_read(inode);
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
- } else {
- page = __page_cache_alloc(GFP_NOFS);
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
- err = __ceph_do_getattr(inode, page,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (err < 0) {
- /* no inline data */
- if (err == -ENODATA)
- err = 0;
- goto out;
- }
- len = err;
- }
+ len = i_size_read(inode);
+ if (len > folio_size(folio))
+ len = folio_size(folio);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1,
@@ -1657,7 +1685,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
req->r_mtime = inode->i_mtime;
@@ -1666,7 +1694,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
if (err < 0)
- goto out;
+ goto out_unlock;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3,
@@ -1675,10 +1703,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
- osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
+ pages[0] = folio_page(folio, 0);
+ osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
{
__le64 xattr_buf = cpu_to_le64(inline_version);
@@ -1688,7 +1717,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
CEPH_OSD_CMPXATTR_OP_GT,
CEPH_OSD_CMPXATTR_MODE_U64);
if (err)
- goto out_put;
+ goto out_put_req;
}
{
@@ -1699,7 +1728,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
"inline_version",
xattr_buf, xattr_len, 0, 0);
if (err)
- goto out_put;
+ goto out_put_req;
}
req->r_mtime = inode->i_mtime;
@@ -1710,19 +1739,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
-out_put:
+ if (!err) {
+ int dirty;
+
+ /* Set to CAP_INLINE_NONE and dirty the caps */
+ down_read(&fsc->mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ up_read(&fsc->mdsc->snap_rwsem);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+out_put_req:
ceph_osdc_put_request(req);
if (err == -ECANCELED)
err = 0;
+out_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
out:
- if (page && page != locked_page) {
- if (from_pagecache) {
- unlock_page(page);
- put_page(page);
- } else
- __free_pages(page, 0);
- }
-
+ ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
inode, ceph_vinop(inode), inline_version, err);
return err;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 9cfadbb86568..ddea99922073 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -12,216 +12,99 @@
#include "super.h"
#include "cache.h"
-struct ceph_aux_inode {
- u64 version;
- u64 mtime_sec;
- u64 mtime_nsec;
-};
-
-struct fscache_netfs ceph_cache_netfs = {
- .name = "ceph",
- .version = 0,
-};
-
-static DEFINE_MUTEX(ceph_fscache_lock);
-static LIST_HEAD(ceph_fscache_list);
-
-struct ceph_fscache_entry {
- struct list_head list;
- struct fscache_cookie *fscache;
- size_t uniq_len;
- /* The following members must be last */
- struct ceph_fsid fsid;
- char uniquifier[];
-};
-
-static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
- .name = "CEPH.fsid",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-int __init ceph_fscache_register(void)
-{
- return fscache_register_netfs(&ceph_cache_netfs);
-}
-
-void ceph_fscache_unregister(void)
-{
- fscache_unregister_netfs(&ceph_cache_netfs);
-}
-
-int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc)
+void ceph_fscache_register_inode_cookie(struct inode *inode)
{
- const struct ceph_fsid *fsid = &fsc->client->fsid;
- const char *fscache_uniq = fsc->mount_options->fscache_uniq;
- size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
- struct ceph_fscache_entry *ent;
- int err = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- mutex_lock(&ceph_fscache_lock);
- list_for_each_entry(ent, &ceph_fscache_list, list) {
- if (memcmp(&ent->fsid, fsid, sizeof(*fsid)))
- continue;
- if (ent->uniq_len != uniq_len)
- continue;
- if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len))
- continue;
-
- errorfc(fc, "fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option",
- fsid);
- err = -EBUSY;
- goto out_unlock;
- }
+ /* No caching for filesystem? */
+ if (!fsc->fscache)
+ return;
- ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL);
- if (!ent) {
- err = -ENOMEM;
- goto out_unlock;
- }
+ /* Regular files only */
+ if (!S_ISREG(inode->i_mode))
+ return;
- memcpy(&ent->fsid, fsid, sizeof(*fsid));
- if (uniq_len > 0) {
- memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
- ent->uniq_len = uniq_len;
- }
+ /* Only new inodes! */
+ if (!(inode->i_state & I_NEW))
+ return;
- fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
- &ceph_fscache_fsid_object_def,
- &ent->fsid, sizeof(ent->fsid) + uniq_len,
- NULL, 0,
- fsc, 0, true);
+ WARN_ON_ONCE(ci->netfs_ctx.cache);
- if (fsc->fscache) {
- ent->fscache = fsc->fscache;
- list_add_tail(&ent->list, &ceph_fscache_list);
- } else {
- kfree(ent);
- errorfc(fc, "unable to register fscache cookie for fsid %pU",
- fsid);
- /* all other fs ignore this error */
- }
-out_unlock:
- mutex_unlock(&ceph_fscache_lock);
- return err;
+ ci->netfs_ctx.cache =
+ fscache_acquire_cookie(fsc->fscache, 0,
+ &ci->i_vino, sizeof(ci->i_vino),
+ &ci->i_version, sizeof(ci->i_version),
+ i_size_read(inode));
}
-static enum fscache_checkaux ceph_fscache_inode_check_aux(
- void *cookie_netfs_data, const void *data, uint16_t dlen,
- loff_t object_size)
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci)
{
- struct ceph_aux_inode aux;
- struct ceph_inode_info* ci = cookie_netfs_data;
- struct inode* inode = &ci->vfs_inode;
-
- if (dlen != sizeof(aux) ||
- i_size_read(inode) != object_size)
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- memset(&aux, 0, sizeof(aux));
- aux.version = ci->i_version;
- aux.mtime_sec = inode->i_mtime.tv_sec;
- aux.mtime_nsec = inode->i_mtime.tv_nsec;
+ fscache_relinquish_cookie(ceph_fscache_cookie(ci), false);
+}
- if (memcmp(data, &aux, sizeof(aux)) != 0)
- return FSCACHE_CHECKAUX_OBSOLETE;
+void ceph_fscache_use_cookie(struct inode *inode, bool will_modify)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
- dout("ceph inode 0x%p cached okay\n", ci);
- return FSCACHE_CHECKAUX_OKAY;
+ fscache_use_cookie(ceph_fscache_cookie(ci), will_modify);
}
-static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
- .name = "CEPH.inode",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .check_aux = ceph_fscache_inode_check_aux,
-};
-
-void ceph_fscache_register_inode_cookie(struct inode *inode)
+void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_aux_inode aux;
-
- /* No caching for filesystem */
- if (!fsc->fscache)
- return;
- /* Only cache for regular files that are read only */
- if (!S_ISREG(inode->i_mode))
- return;
+ if (update) {
+ loff_t i_size = i_size_read(inode);
- inode_lock_nested(inode, I_MUTEX_CHILD);
- if (!ci->fscache) {
- memset(&aux, 0, sizeof(aux));
- aux.version = ci->i_version;
- aux.mtime_sec = inode->i_mtime.tv_sec;
- aux.mtime_nsec = inode->i_mtime.tv_nsec;
- ci->fscache = fscache_acquire_cookie(fsc->fscache,
- &ceph_fscache_inode_object_def,
- &ci->i_vino, sizeof(ci->i_vino),
- &aux, sizeof(aux),
- ci, i_size_read(inode), false);
+ fscache_unuse_cookie(ceph_fscache_cookie(ci),
+ &ci->i_version, &i_size);
+ } else {
+ fscache_unuse_cookie(ceph_fscache_cookie(ci), NULL, NULL);
}
- inode_unlock(inode);
}
-void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+void ceph_fscache_update(struct inode *inode)
{
- struct fscache_cookie* cookie;
-
- if ((cookie = ci->fscache) == NULL)
- return;
-
- ci->fscache = NULL;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ loff_t i_size = i_size_read(inode);
- fscache_relinquish_cookie(cookie, &ci->i_vino, false);
+ fscache_update_cookie(ceph_fscache_cookie(ci), &ci->i_version, &i_size);
}
-static bool ceph_fscache_can_enable(void *data)
+void ceph_fscache_invalidate(struct inode *inode, bool dio_write)
{
- struct inode *inode = data;
- return !inode_is_open_for_write(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ fscache_invalidate(ceph_fscache_cookie(ci),
+ &ci->i_version, i_size_read(inode),
+ dio_write ? FSCACHE_INVAL_DIO_WRITE : 0);
}
-void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
+ const struct ceph_fsid *fsid = &fsc->client->fsid;
+ const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+ size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+ char *name;
+ int err = 0;
- if (!fscache_cookie_valid(ci->fscache))
- return;
+ name = kasprintf(GFP_KERNEL, "ceph,%pU%s%s", fsid, uniq_len ? "," : "",
+ uniq_len ? fscache_uniq : "");
+ if (!name)
+ return -ENOMEM;
- if (inode_is_open_for_write(inode)) {
- dout("fscache_file_set_cookie %p %p disabling cache\n",
- inode, filp);
- fscache_disable_cookie(ci->fscache, &ci->i_vino, false);
- } else {
- fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode),
- ceph_fscache_can_enable, inode);
- if (fscache_cookie_enabled(ci->fscache)) {
- dout("fscache_file_set_cookie %p %p enabling cache\n",
- inode, filp);
- }
+ fsc->fscache = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(fsc->fscache)) {
+ errorfc(fc, "Unable to register fscache cookie for %s", name);
+ err = fsc->fscache ? PTR_ERR(fsc->fscache) : -EOPNOTSUPP;
+ fsc->fscache = NULL;
}
+ kfree(name);
+ return err;
}
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
- if (fscache_cookie_valid(fsc->fscache)) {
- struct ceph_fscache_entry *ent;
- bool found = false;
-
- mutex_lock(&ceph_fscache_lock);
- list_for_each_entry(ent, &ceph_fscache_list, list) {
- if (ent->fscache == fsc->fscache) {
- list_del(&ent->list);
- kfree(ent);
- found = true;
- break;
- }
- }
- WARN_ON_ONCE(!found);
- mutex_unlock(&ceph_fscache_lock);
-
- __fscache_relinquish_cookie(fsc->fscache, NULL, false);
- }
- fsc->fscache = NULL;
+ fscache_relinquish_volume(fsc->fscache, NULL, false);
}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 058ea2a04376..7255b790a4c1 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -12,61 +12,70 @@
#include <linux/netfs.h>
#ifdef CONFIG_CEPH_FSCACHE
-
-extern struct fscache_netfs ceph_cache_netfs;
-
-int ceph_fscache_register(void);
-void ceph_fscache_unregister(void);
+#include <linux/fscache.h>
int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc);
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
void ceph_fscache_register_inode_cookie(struct inode *inode);
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
-void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
-void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);
-static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+void ceph_fscache_use_cookie(struct inode *inode, bool will_modify);
+void ceph_fscache_unuse_cookie(struct inode *inode, bool update);
+
+void ceph_fscache_update(struct inode *inode);
+void ceph_fscache_invalidate(struct inode *inode, bool dio_write);
+
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
{
- ci->fscache = NULL;
+ return netfs_i_cookie(&ci->vfs_inode);
}
-static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
{
- return ci->fscache;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
+
+ if (cookie) {
+ ceph_fscache_use_cookie(inode, true);
+ fscache_resize_cookie(cookie, to);
+ ceph_fscache_unuse_cookie(inode, true);
+ }
}
-static inline void ceph_fscache_invalidate(struct inode *inode)
+static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
{
- fscache_invalidate(ceph_inode(inode)->fscache);
+ fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline bool ceph_is_cache_enabled(struct inode *inode)
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode));
+ struct ceph_inode_info *ci = ceph_inode(mapping->host);
- if (!cookie)
- return false;
- return fscache_cookie_enabled(cookie);
+ return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
}
-static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
+static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
{
struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
- return fscache_begin_read_operation(rreq, cookie);
+ return fscache_begin_read_operation(&rreq->cache_resources, cookie);
}
-#else
-static inline int ceph_fscache_register(void)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
{
- return 0;
+ return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline void ceph_fscache_unregister(void)
+static inline void ceph_fscache_note_page_release(struct inode *inode)
{
-}
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ fscache_note_page_release(ceph_fscache_cookie(ci));
+}
+#else /* CONFIG_CEPH_FSCACHE */
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc,
struct fs_context *fc)
{
@@ -77,41 +86,63 @@ static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
}
-static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
{
}
-static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
{
- return NULL;
}
-static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
+static inline void ceph_fscache_use_cookie(struct inode *inode, bool will_modify)
{
}
-static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+static inline void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
{
}
-static inline void ceph_fscache_file_set_cookie(struct inode *inode,
- struct file *filp)
+static inline void ceph_fscache_update(struct inode *inode)
{
}
-static inline void ceph_fscache_invalidate(struct inode *inode)
+static inline void ceph_fscache_invalidate(struct inode *inode, bool dio_write)
{
}
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+{
+ return NULL;
+}
+
+static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
+{
+}
+
+static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
+{
+}
+
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ return filemap_dirty_folio(mapping, folio);
+}
+
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return false;
}
-static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
+static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
{
return -ENOBUFS;
}
-#endif
-#endif /* _CEPH_CACHE_H */
+static inline void ceph_fscache_note_page_release(struct inode *inode)
+{
+}
+#endif /* CONFIG_CEPH_FSCACHE */
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8f537f1d9d1d..5c14ef04e474 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
lockdep_assert_held(&ci->i_ceph_lock);
- fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ fsc = ceph_inode_to_client(&ci->vfs_inode);
WARN_ON_ONCE(ci->i_auth_cap == cap &&
!list_empty(&ci->i_dirty_item) &&
!fsc->blocklisted &&
- READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
+ !ceph_inode_is_shutdown(&ci->vfs_inode));
__ceph_remove_cap(cap, queue_release);
}
@@ -1856,7 +1856,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
u32 invalidating_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- ceph_fscache_invalidate(inode);
+ ceph_fscache_invalidate(inode, false);
invalidate_mapping_pages(&inode->i_data, 0, -1);
spin_lock(&ci->i_ceph_lock);
@@ -1915,6 +1915,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
ceph_get_mds_session(session);
spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ /* Don't send messages until we get async create reply */
+ spin_unlock(&ci->i_ceph_lock);
+ ceph_put_mds_session(session);
+ return;
+ }
+
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
retry:
@@ -1968,8 +1975,8 @@ retry:
}
}
- dout("check_caps %p file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s\n", inode,
+ dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
+ " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode),
ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
@@ -1990,7 +1997,8 @@ retry:
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
- dout("check_caps trying to invalidate on %p\n", inode);
+ dout("check_caps trying to invalidate on %llx.%llx\n",
+ ceph_vinop(inode));
if (try_nonblocking_invalidate(inode) < 0) {
dout("check_caps queuing invalidate\n");
queue_invalidate = true;
@@ -2217,6 +2225,7 @@ static int unsafe_request_wait(struct inode *inode)
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ unsigned int max_sessions;
int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
@@ -2235,36 +2244,46 @@ static int unsafe_request_wait(struct inode *inode)
spin_unlock(&ci->i_unsafe_lock);
/*
+ * The mdsc->max_sessions is unlikely to be changed
+ * mostly, here we will retry it by reallocating the
+ * sessions array memory to get rid of the mdsc->mutex
+ * lock.
+ */
+retry:
+ max_sessions = mdsc->max_sessions;
+
+ /*
* Trigger to flush the journal logs in all the relevant MDSes
* manually, or in the worst case we must wait at most 5 seconds
* to wait the journal logs to be flushed by the MDSes periodically.
*/
- if (req1 || req2) {
+ if ((req1 || req2) && likely(max_sessions)) {
struct ceph_mds_session **sessions = NULL;
struct ceph_mds_session *s;
struct ceph_mds_request *req;
- unsigned int max;
int i;
- /*
- * The mdsc->max_sessions is unlikely to be changed
- * mostly, here we will retry it by reallocating the
- * sessions arrary memory to get rid of the mdsc->mutex
- * lock.
- */
-retry:
- max = mdsc->max_sessions;
- sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
- if (!sessions)
- return -ENOMEM;
+ sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL);
+ if (!sessions) {
+ err = -ENOMEM;
+ goto out;
+ }
spin_lock(&ci->i_unsafe_lock);
if (req1) {
list_for_each_entry(req, &ci->i_unsafe_dirops,
r_unsafe_dir_item) {
s = req->r_session;
- if (unlikely(s->s_mds >= max)) {
+ if (!s)
+ continue;
+ if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
+ for (i = 0; i < max_sessions; i++) {
+ s = sessions[i];
+ if (s)
+ ceph_put_mds_session(s);
+ }
+ kfree(sessions);
goto retry;
}
if (!sessions[s->s_mds]) {
@@ -2277,8 +2296,16 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_iops,
r_unsafe_target_item) {
s = req->r_session;
- if (unlikely(s->s_mds >= max)) {
+ if (!s)
+ continue;
+ if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
+ for (i = 0; i < max_sessions; i++) {
+ s = sessions[i];
+ if (s)
+ ceph_put_mds_session(s);
+ }
+ kfree(sessions);
goto retry;
}
if (!sessions[s->s_mds]) {
@@ -2299,7 +2326,7 @@ retry:
spin_unlock(&ci->i_ceph_lock);
/* send flush mdlog request to MDSes */
- for (i = 0; i < max; i++) {
+ for (i = 0; i < max_sessions; i++) {
s = sessions[i];
if (s) {
send_flush_mdlog(s);
@@ -2316,15 +2343,19 @@ retry:
ceph_timeout_jiffies(req1->r_timeout));
if (ret)
err = -EIO;
- ceph_mdsc_put_request(req1);
}
if (req2) {
ret = !wait_for_completion_timeout(&req2->r_safe_completion,
ceph_timeout_jiffies(req2->r_timeout));
if (ret)
err = -EIO;
- ceph_mdsc_put_request(req2);
}
+
+out:
+ if (req1)
+ ceph_mdsc_put_request(req1);
+ if (req2)
+ ceph_mdsc_put_request(req2);
return err;
}
@@ -2387,7 +2418,11 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
dout("write_inode %p wait=%d\n", inode, wait);
+ ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
+ err = ceph_wait_on_async_create(inode);
+ if (err)
+ return err;
dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
@@ -2418,6 +2453,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
u64 first_tid = 0;
u64 last_snap_flush = 0;
+ /* Don't do anything until create reply comes in */
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
+ return;
+
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
@@ -2629,9 +2668,9 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
*
* Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
* or a negative error code. There are 3 speical error codes:
- * -EAGAIN: need to sleep but non-blocking is specified
- * -EFBIG: ask caller to call check_max_size() and try again.
- * -ESTALE: ask caller to call ceph_renew_caps() and try again.
+ * -EAGAIN: need to sleep but non-blocking is specified
+ * -EFBIG: ask caller to call check_max_size() and try again.
+ * -EUCLEAN: ask caller to call ceph_renew_caps() and try again.
*/
enum {
/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
@@ -2679,7 +2718,7 @@ again:
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size)
- ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
+ ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
goto out_unlock;
}
/*
@@ -2749,9 +2788,9 @@ again:
goto out_unlock;
}
- if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
- dout("get_cap_refs %p forced umount\n", inode);
- ret = -EIO;
+ if (ceph_inode_is_shutdown(inode)) {
+ dout("get_cap_refs %p inode is shutdown\n", inode);
+ ret = -ESTALE;
goto out_unlock;
}
mds_wanted = __ceph_caps_mds_wanted(ci, false);
@@ -2759,7 +2798,7 @@ again:
dout("get_cap_refs %p need %s > mds_wanted %s\n",
inode, ceph_cap_string(need),
ceph_cap_string(mds_wanted));
- ret = -ESTALE;
+ ret = -EUCLEAN;
goto out_unlock;
}
@@ -2843,7 +2882,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
ret = try_get_cap_refs(inode, need, want, 0, flags, got);
/* three special error codes */
- if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE)
+ if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN)
ret = 0;
return ret;
}
@@ -2926,7 +2965,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
}
if (ret < 0) {
- if (ret == -EFBIG || ret == -ESTALE) {
+ if (ret == -EFBIG || ret == -EUCLEAN) {
int ret2 = ceph_wait_on_async_create(inode);
if (ret2 < 0)
return ret2;
@@ -2935,7 +2974,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
check_max_size(inode, endoff);
continue;
}
- if (ret == -ESTALE) {
+ if (ret == -EUCLEAN) {
/* session was killed, try renew caps */
ret = ceph_renew_caps(inode, flags);
if (ret == 0)
@@ -3374,8 +3413,7 @@ static void handle_cap_grant(struct inode *inode,
if ((newcaps & CEPH_CAP_LINK_SHARED) &&
(extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
set_nlink(inode, le32_to_cpu(grant->nlink));
- if (inode->i_nlink == 0 &&
- (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ if (inode->i_nlink == 0)
deleted_inode = true;
}
@@ -3836,6 +3874,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
inode, ci, mds, mseq, target);
retry:
+ down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
@@ -3899,6 +3938,7 @@ retry:
}
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
/* open target session */
@@ -3924,6 +3964,7 @@ retry:
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
if (tsession) {
mutex_unlock(&tsession->s_mutex);
@@ -4132,7 +4173,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
- ci = ceph_inode(inode);
dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
vino.snap, inode);
@@ -4158,6 +4198,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
goto flush_cap_releases;
}
+ ci = ceph_inode(inode);
/* these will work even if we don't have a cap yet */
switch (op) {
@@ -4315,7 +4356,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
i_dirty_item);
inode = &ci->vfs_inode;
ihold(inode);
- dout("flush_dirty_caps %p\n", inode);
+ dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
@@ -4349,7 +4390,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
- bool is_opened = false;
+ bool already_opened = false;
int i;
if (count == 1)
@@ -4357,19 +4398,19 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
- if (bits & (1 << i))
- ci->i_nr_by_mode[i] += count;
-
/*
- * If any of the mode ref is larger than 1,
+ * If any of the mode ref is larger than 0,
* that means it has been already opened by
* others. Just skip checking the PIN ref.
*/
- if (i && ci->i_nr_by_mode[i] > 1)
- is_opened = true;
+ if (i && ci->i_nr_by_mode[i])
+ already_opened = true;
+
+ if (bits & (1 << i))
+ ci->i_nr_by_mode[i] += count;
}
- if (!is_opened)
+ if (!already_opened)
percpu_counter_inc(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}
@@ -4560,3 +4601,119 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
spin_unlock(&dentry->d_lock);
return ret;
}
+
+static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap_snap *capsnap;
+ int capsnap_release = 0;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
+
+ while (!list_empty(&ci->i_cap_snaps)) {
+ capsnap = list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ __ceph_remove_capsnap(inode, capsnap, NULL, NULL);
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ capsnap_release++;
+ }
+ wake_up_all(&ci->i_cap_wq);
+ wake_up_all(&mdsc->cap_flushing_wq);
+ return capsnap_release;
+}
+
+int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool is_auth;
+ bool dirty_dropped = false;
+ int iputs = 0;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ dout("removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->vfs_inode);
+
+ is_auth = (cap == ci->i_auth_cap);
+ __ceph_remove_cap(cap, false);
+ if (is_auth) {
+ struct ceph_cap_flush *cf;
+
+ if (ceph_inode_is_shutdown(inode)) {
+ if (inode->i_data.nrpages > 0)
+ *invalidate = true;
+ if (ci->i_wrbuffer_ref > 0)
+ mapping_set_error(&inode->i_data, -EIO);
+ }
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ /* trash all of the cap flushes for this inode */
+ while (!list_empty(&ci->i_cap_flush_list)) {
+ cf = list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ list_del_init(&cf->g_list);
+ list_del_init(&cf->i_list);
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
+ }
+
+ if (!list_empty(&ci->i_dirty_item)) {
+ pr_warn_ratelimited(
+ " dropping dirty %s state for %p %lld\n",
+ ceph_cap_string(ci->i_dirty_caps),
+ inode, ceph_ino(inode));
+ ci->i_dirty_caps = 0;
+ list_del_init(&ci->i_dirty_item);
+ dirty_dropped = true;
+ }
+ if (!list_empty(&ci->i_flushing_item)) {
+ pr_warn_ratelimited(
+ " dropping dirty+flushing %s state for %p %lld\n",
+ ceph_cap_string(ci->i_flushing_caps),
+ inode, ceph_ino(inode));
+ ci->i_flushing_caps = 0;
+ list_del_init(&ci->i_flushing_item);
+ mdsc->num_cap_flushing--;
+ dirty_dropped = true;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ if (dirty_dropped) {
+ mapping_set_error(inode->i_mapping, -EIO);
+
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ }
+
+ if (atomic_read(&ci->i_filelock_ref) > 0) {
+ /* make further file lock syscall return -EIO */
+ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
+ pr_warn_ratelimited(" dropping file locks for %p %lld\n",
+ inode, ceph_ino(inode));
+ }
+
+ if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+ cf = ci->i_prealloc_cap_flush;
+ ci->i_prealloc_cap_flush = NULL;
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
+ }
+
+ if (!list_empty(&ci->i_cap_snaps))
+ iputs = remove_capsnaps(mdsc, inode);
+ }
+ if (dirty_dropped)
+ ++iputs;
+ return iputs;
+}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 38b78b45811f..bec3c4549c07 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -146,82 +146,92 @@ static int mdsc_show(struct seq_file *s, void *p)
name, total, avg, _min, max, sum); \
}
-static int metric_show(struct seq_file *s, void *p)
+static int metrics_file_show(struct seq_file *s, void *p)
{
struct ceph_fs_client *fsc = s->private;
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_client_metric *m = &mdsc->metric;
- int nr_caps = 0;
- s64 total, sum, avg, min, max, sq;
- u64 sum_sz, avg_sz, min_sz, max_sz;
+ struct ceph_client_metric *m = &fsc->mdsc->metric;
- sum = percpu_counter_sum(&m->total_inodes);
seq_printf(s, "item total\n");
seq_printf(s, "------------------------------------------\n");
- seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes",
- atomic64_read(&m->opened_files), sum);
- seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes",
- atomic64_read(&m->total_caps), sum);
- seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes",
- percpu_counter_sum(&m->opened_inodes), sum);
-
- seq_printf(s, "\n");
+ seq_printf(s, "%-35s%lld\n", "total inodes",
+ percpu_counter_sum(&m->total_inodes));
+ seq_printf(s, "%-35s%lld\n", "opened files",
+ atomic64_read(&m->opened_files));
+ seq_printf(s, "%-35s%lld\n", "pinned i_caps",
+ atomic64_read(&m->total_caps));
+ seq_printf(s, "%-35s%lld\n", "opened inodes",
+ percpu_counter_sum(&m->opened_inodes));
+ return 0;
+}
+
+static const char * const metric_str[] = {
+ "read",
+ "write",
+ "metadata",
+ "copyfrom"
+};
+static int metrics_latency_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *cm = &fsc->mdsc->metric;
+ struct ceph_metric *m;
+ s64 total, avg, min, max, sq;
+ int i;
+
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
seq_printf(s, "-----------------------------------------------------------------------------------\n");
- spin_lock(&m->read_metric_lock);
- total = m->total_reads;
- sum = m->read_latency_sum;
- avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
- min = m->read_latency_min;
- max = m->read_latency_max;
- sq = m->read_latency_sq_sum;
- spin_unlock(&m->read_metric_lock);
- CEPH_LAT_METRIC_SHOW("read", total, avg, min, max, sq);
-
- spin_lock(&m->write_metric_lock);
- total = m->total_writes;
- sum = m->write_latency_sum;
- avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
- min = m->write_latency_min;
- max = m->write_latency_max;
- sq = m->write_latency_sq_sum;
- spin_unlock(&m->write_metric_lock);
- CEPH_LAT_METRIC_SHOW("write", total, avg, min, max, sq);
-
- spin_lock(&m->metadata_metric_lock);
- total = m->total_metadatas;
- sum = m->metadata_latency_sum;
- avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
- min = m->metadata_latency_min;
- max = m->metadata_latency_max;
- sq = m->metadata_latency_sq_sum;
- spin_unlock(&m->metadata_metric_lock);
- CEPH_LAT_METRIC_SHOW("metadata", total, avg, min, max, sq);
-
- seq_printf(s, "\n");
+ for (i = 0; i < METRIC_MAX; i++) {
+ m = &cm->metric[i];
+ spin_lock(&m->lock);
+ total = m->total;
+ avg = m->latency_avg;
+ min = m->latency_min;
+ max = m->latency_max;
+ sq = m->latency_sq_sum;
+ spin_unlock(&m->lock);
+ CEPH_LAT_METRIC_SHOW(metric_str[i], total, avg, min, max, sq);
+ }
+
+ return 0;
+}
+
+static int metrics_size_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *cm = &fsc->mdsc->metric;
+ struct ceph_metric *m;
+ s64 total;
+ u64 sum, avg, min, max;
+ int i;
+
seq_printf(s, "item total avg_sz(bytes) min_sz(bytes) max_sz(bytes) total_sz(bytes)\n");
seq_printf(s, "----------------------------------------------------------------------------------------\n");
- spin_lock(&m->read_metric_lock);
- total = m->total_reads;
- sum_sz = m->read_size_sum;
- avg_sz = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0;
- min_sz = m->read_size_min;
- max_sz = m->read_size_max;
- spin_unlock(&m->read_metric_lock);
- CEPH_SZ_METRIC_SHOW("read", total, avg_sz, min_sz, max_sz, sum_sz);
-
- spin_lock(&m->write_metric_lock);
- total = m->total_writes;
- sum_sz = m->write_size_sum;
- avg_sz = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0;
- min_sz = m->write_size_min;
- max_sz = m->write_size_max;
- spin_unlock(&m->write_metric_lock);
- CEPH_SZ_METRIC_SHOW("write", total, avg_sz, min_sz, max_sz, sum_sz);
-
- seq_printf(s, "\n");
+ for (i = 0; i < METRIC_MAX; i++) {
+ /* skip 'metadata' as it doesn't use the size metric */
+ if (i == METRIC_METADATA)
+ continue;
+ m = &cm->metric[i];
+ spin_lock(&m->lock);
+ total = m->total;
+ sum = m->size_sum;
+ avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
+ min = m->size_min;
+ max = m->size_max;
+ spin_unlock(&m->lock);
+ CEPH_SZ_METRIC_SHOW(metric_str[i], total, avg, min, max, sum);
+ }
+
+ return 0;
+}
+
+static int metrics_caps_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *m = &fsc->mdsc->metric;
+ int nr_caps = 0;
+
seq_printf(s, "item total miss hit\n");
seq_printf(s, "-------------------------------------------------\n");
@@ -350,8 +360,11 @@ DEFINE_SHOW_ATTRIBUTE(mdsmap);
DEFINE_SHOW_ATTRIBUTE(mdsc);
DEFINE_SHOW_ATTRIBUTE(caps);
DEFINE_SHOW_ATTRIBUTE(mds_sessions);
-DEFINE_SHOW_ATTRIBUTE(metric);
DEFINE_SHOW_ATTRIBUTE(status);
+DEFINE_SHOW_ATTRIBUTE(metrics_file);
+DEFINE_SHOW_ATTRIBUTE(metrics_latency);
+DEFINE_SHOW_ATTRIBUTE(metrics_size);
+DEFINE_SHOW_ATTRIBUTE(metrics_caps);
/*
@@ -385,8 +398,9 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_mdsmap);
debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
- debugfs_remove(fsc->debugfs_metric);
+ debugfs_remove(fsc->debugfs_status);
debugfs_remove(fsc->debugfs_mdsc);
+ debugfs_remove_recursive(fsc->debugfs_metrics_dir);
}
void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
@@ -426,12 +440,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
fsc,
&mdsc_fops);
- fsc->debugfs_metric = debugfs_create_file("metrics",
- 0400,
- fsc->client->debugfs_dir,
- fsc,
- &metric_fops);
-
fsc->debugfs_caps = debugfs_create_file("caps",
0400,
fsc->client->debugfs_dir,
@@ -443,6 +451,18 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
fsc->client->debugfs_dir,
fsc,
&status_fops);
+
+ fsc->debugfs_metrics_dir = debugfs_create_dir("metrics",
+ fsc->client->debugfs_dir);
+
+ debugfs_create_file("file", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_file_fops);
+ debugfs_create_file("latency", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_latency_fops);
+ debugfs_create_file("size", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_size_fops);
+ debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_caps_fops);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 133dbd9338e7..eae417d71136 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
- i_mutex, no need to use page lock */
+ i_rwsem, no need to use page lock */
unlock_page(cache_ctl->page);
cache_ctl->dentries = kmap(cache_ctl->page);
}
@@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
rcu_read_lock();
spin_lock(&parent->d_lock);
/* check i_size again here, because empty directory can be
- * marked as complete while not holding the i_mutex. */
+ * marked as complete while not holding the i_rwsem. */
if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
dentry = cache_ctl->dentries[cache_ctl->index];
else
@@ -478,8 +478,11 @@ more:
2 : (fpos_off(rde->offset) + 1);
err = note_last_dentry(dfi, rde->name, rde->name_len,
next_offset);
- if (err)
+ if (err) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
return err;
+ }
} else if (req->r_reply_info.dir_end) {
dfi->next_offset = 2;
/* keep last name */
@@ -520,6 +523,12 @@ more:
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
le32_to_cpu(rde->inode.in->mode) >> 12)) {
+ /*
+ * NOTE: Here no need to put the 'dfi->last_readdir',
+ * because when dir_emit stops us it's most likely
+ * doesn't have enough memory, etc. So for next readdir
+ * it will continue.
+ */
dout("filldir stopping us...\n");
return 0;
}
@@ -671,7 +680,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
+ struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
/* .snap dir? */
if (ceph_snap(parent) == CEPH_NOSNAP &&
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 1d65934c1262..e0fa66ac8b9f 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -157,6 +157,11 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
ceph_mdsc_put_request(req);
if (!inode)
return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE);
+ } else {
+ if (ceph_inode_is_shutdown(inode)) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
}
return inode;
}
@@ -223,8 +228,13 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
return ERR_PTR(-ESTALE);
inode = ceph_find_inode(sb, vino);
- if (inode)
+ if (inode) {
+ if (ceph_inode_is_shutdown(inode)) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
return d_obtain_alias(inode);
+ }
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
USE_ANY_MDS);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e61018d9764e..8c8226c0feac 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -204,7 +204,10 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
int fmode, bool isdir)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mount_options *opt =
+ ceph_inode_to_client(&ci->vfs_inode)->mount_options;
struct ceph_file_info *fi;
+ int ret;
dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
inode->i_mode, isdir ? "dir" : "regular");
@@ -225,6 +228,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
if (!fi)
return -ENOMEM;
+ if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+ fi->flags |= CEPH_F_SYNC;
+
file->private_data = fi;
}
@@ -235,7 +241,22 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
INIT_LIST_HEAD(&fi->rw_contexts);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
+ if ((file->f_mode & FMODE_WRITE) &&
+ ci->i_inline_version != CEPH_INLINE_NONE) {
+ ret = ceph_uninline_data(file);
+ if (ret < 0)
+ goto error;
+ }
+
return 0;
+
+error:
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
+ ceph_put_fmode(ci, fi->fmode, 1);
+ kmem_cache_free(ceph_file_cachep, fi);
+ /* wake up anyone waiting for caps on this inode */
+ wake_up_all(&ci->i_cap_wq);
+ return ret;
}
/*
@@ -248,8 +269,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
- ceph_fscache_register_inode_cookie(inode);
- ceph_fscache_file_set_cookie(inode, file);
+ ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE);
fallthrough;
case S_IFDIR:
ret = ceph_init_file_info(inode, file, fmode,
@@ -512,51 +532,68 @@ static void restore_deleg_ino(struct inode *dir, u64 ino)
}
}
+static void wake_async_create_waiters(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+ wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+ }
+ ceph_kick_flushing_inode_caps(session, ci);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct dentry *dentry = req->r_dentry;
+ struct inode *dinode = d_inode(dentry);
+ struct inode *tinode = req->r_target_inode;
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
+ WARN_ON_ONCE(dinode && tinode && dinode != tinode);
+
+ /* MDS changed -- caller must resubmit */
if (result == -EJUKEBOX)
goto out;
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- struct dentry *dentry = req->r_dentry;
int pathlen = 0;
u64 base = 0;
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
+ pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
d_drop(dentry);
- /* FIXME: start returning I/O errors on all accesses? */
- pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ if (dinode) {
+ mapping_set_error(dinode->i_mapping, result);
+ ceph_inode_shutdown(dinode);
+ wake_async_create_waiters(dinode, req->r_session);
+ }
}
- if (req->r_target_inode) {
- struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
- u64 ino = ceph_vino(req->r_target_inode).ino;
+ if (tinode) {
+ u64 ino = ceph_vino(tinode).ino;
if (req->r_deleg_ino != ino)
pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
__func__, req->r_err, req->r_deleg_ino, ino);
- mapping_set_error(req->r_target_inode->i_mapping, result);
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
- ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
- wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
- }
- ceph_kick_flushing_inode_caps(req->r_session, ci);
- spin_unlock(&ci->i_ceph_lock);
- } else {
+ mapping_set_error(tinode->i_mapping, result);
+ wake_async_create_waiters(tinode, req->r_session);
+ } else if (!result) {
pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
req->r_deleg_ino);
}
@@ -577,6 +614,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
struct ceph_inode_info *ci = ceph_inode(dir);
struct inode *inode;
struct timespec64 now;
+ struct ceph_string *pool_ns;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_vino vino = { .ino = req->r_deleg_ino,
.snap = CEPH_NOSNAP };
@@ -591,9 +629,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
iinfo.change_attr = 1;
ceph_encode_timespec64(&iinfo.btime, &now);
- iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
- iinfo.xattr_data = xattr_buf;
- memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ if (req->r_pagelist) {
+ iinfo.xattr_len = req->r_pagelist->length;
+ iinfo.xattr_data = req->r_pagelist->mapped_tail;
+ } else {
+ /* fake it */
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ }
in.ino = cpu_to_le64(vino.ino);
in.snapid = cpu_to_le64(CEPH_NOSNAP);
@@ -603,17 +647,35 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
in.cap.flags = CEPH_CAP_FLAG_AUTH;
in.ctime = in.mtime = in.atime = iinfo.btime;
- in.mode = cpu_to_le32((u32)mode);
in.truncate_seq = cpu_to_le32(1);
in.truncate_size = cpu_to_le64(-1ULL);
in.xattr_version = cpu_to_le64(1);
in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
- in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ?
- dir->i_gid : current_fsgid()));
+ if (dir->i_mode & S_ISGID) {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
+
+ /* Directories always inherit the setgid bit. */
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
+ !in_group_p(dir->i_gid) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, dir, CAP_FSETID))
+ mode &= ~S_ISGID;
+ } else {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
+ }
+ in.mode = cpu_to_le32((u32)mode);
+
in.nlink = cpu_to_le32(1);
in.max_size = cpu_to_le64(lo->stripe_unit);
ceph_file_layout_to_legacy(lo, &in.layout);
+ /* lo is private, so pool_ns can't change */
+ pool_ns = rcu_dereference_raw(lo->pool_ns);
+ if (pool_ns) {
+ iinfo.pool_ns_len = pool_ns->len;
+ iinfo.pool_ns_data = pool_ns->str;
+ }
down_read(&mdsc->snap_rwsem);
ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
@@ -687,6 +749,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out_ctx;
+ /* Async create can't handle more than a page of xattrs */
+ if (as_ctx.pagelist &&
+ !list_is_singular(&as_ctx.pagelist->head))
+ try_async = false;
} else if (!d_in_lookup(dentry)) {
/* If it's not being looked up, it's negative */
return -ENOENT;
@@ -732,8 +798,10 @@ retry:
restore_deleg_ino(dir, req->r_deleg_ino);
ceph_mdsc_put_request(req);
try_async = false;
+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
goto retry;
}
+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
goto out_req;
}
}
@@ -808,6 +876,7 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p regular file %p\n", inode, file);
WARN_ON(!list_empty(&fi->rw_contexts));
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
ceph_put_fmode(ci, fi->fmode, 1);
kmem_cache_free(ceph_file_cachep, fi);
@@ -845,6 +914,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ssize_t ret;
u64 off = iocb->ki_pos;
u64 len = iov_iter_count(to);
+ u64 i_size = i_size_read(inode);
dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
@@ -868,7 +938,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
struct page **pages;
int num_pages;
size_t page_off;
- u64 i_size;
bool more;
int idx;
size_t left;
@@ -951,11 +1020,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
}
if (off > iocb->ki_pos) {
- if (ret >= 0 &&
- iov_iter_count(to) > 0 && off >= i_size_read(inode))
+ if (off >= i_size) {
*retry_op = CHECK_EOF;
- ret = off - iocb->ki_pos;
- iocb->ki_pos = off;
+ ret = i_size - iocb->ki_pos;
+ iocb->ki_pos = i_size;
+ } else {
+ ret = off - iocb->ki_pos;
+ iocb->ki_pos = off;
+ }
}
dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
@@ -1010,7 +1082,6 @@ static void ceph_aio_complete(struct inode *inode,
}
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&aio_req->prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1022,7 +1093,7 @@ static void ceph_aio_complete(struct inode *inode,
ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD));
- aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+ aio_req->iocb->ki_complete(aio_req->iocb, ret);
ceph_free_cap_flush(aio_req->prealloc_cf);
kfree(aio_req);
@@ -1201,7 +1272,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
snapc, snapc ? snapc->seq : 0);
if (write) {
- int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
+ int ret2;
+
+ ceph_fscache_invalidate(inode, true);
+
+ ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
@@ -1412,6 +1487,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (ret < 0)
return ret;
+ ceph_fscache_invalidate(inode, false);
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(pos + count - 1) >> PAGE_SHIFT);
@@ -1519,25 +1595,29 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct ceph_inode_info *ci = ceph_inode(inode);
bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
ssize_t ret;
- int want, got = 0;
+ int want = 0, got = 0;
int retry_op = 0, read = 0;
again:
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
if (direct_lock)
ceph_start_io_direct(inode);
else
ceph_start_io_read(inode);
+ if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+ want |= CEPH_CAP_FILE_CACHE;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_CACHE;
+ want |= CEPH_CAP_FILE_LAZYIO;
+
ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
if (ret < 0) {
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_end_io_direct(inode);
else
ceph_end_io_read(inode);
@@ -1671,13 +1751,16 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
- int err, want, got;
+ int err, want = 0, got;
bool direct_lock = false;
u32 map_flags;
u64 pool_flags;
loff_t pos;
loff_t limit = max(i_size_read(inode), fsc->max_file_size);
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
@@ -1735,18 +1818,12 @@ retry_snap:
if (err)
goto out;
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- err = ceph_uninline_data(file, NULL);
- if (err < 0)
- goto out;
- }
-
dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
inode, ceph_vinop(inode), pos, count, i_size_read(inode));
+ if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+ want |= CEPH_CAP_FILE_BUFFER;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_BUFFER;
+ want |= CEPH_CAP_FILE_LAZYIO;
got = 0;
err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
if (err < 0)
@@ -1802,7 +1879,7 @@ retry_snap:
* are pending vmtruncate. So write and vmtruncate
* can not run at the same time
*/
- written = generic_perform_write(file, from, pos);
+ written = generic_perform_write(iocb, from);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
ceph_end_io_write(inode);
@@ -1812,7 +1889,6 @@ retry_snap:
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -2066,12 +2142,6 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- ret = ceph_uninline_data(file, NULL);
- if (ret < 0)
- goto unlock;
- }
-
size = i_size_read(inode);
/* Are we punching a hole beyond EOF? */
@@ -2090,12 +2160,12 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
filemap_invalidate_lock(inode->i_mapping);
+ ceph_fscache_invalidate(inode, false);
ceph_zero_pagecache_range(inode, offset, length);
ret = ceph_zero_objects(inode, offset, length);
if (!ret) {
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -2200,6 +2270,54 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
return 0;
}
+static struct ceph_osd_request *
+ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc,
+ u64 src_snapid,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ struct ceph_object_id *dst_oid,
+ struct ceph_object_locator *dst_oloc,
+ u32 truncate_seq, u64 truncate_size)
+{
+ struct ceph_osd_request *req;
+ int ret;
+ u32 src_fadvise_flags =
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE;
+ u32 dst_fadvise_flags =
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ req->r_flags = CEPH_OSD_FLAG_WRITE;
+
+ ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
+ ceph_oid_copy(&req->r_t.base_oid, dst_oid);
+
+ ret = osd_req_op_copy_from_init(req, src_snapid, 0,
+ src_oid, src_oloc,
+ src_fadvise_flags,
+ dst_fadvise_flags,
+ truncate_seq,
+ truncate_size,
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+ if (ret)
+ goto out;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ return req;
+
+out:
+ ceph_osdc_put_request(req);
+ return ERR_PTR(ret);
+}
+
static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
struct ceph_inode_info *dst_ci, u64 *dst_off,
struct ceph_fs_client *fsc,
@@ -2207,6 +2325,8 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
{
struct ceph_object_locator src_oloc, dst_oloc;
struct ceph_object_id src_oid, dst_oid;
+ struct ceph_osd_client *osdc;
+ struct ceph_osd_request *req;
size_t bytes = 0;
u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
u32 src_objlen, dst_objlen;
@@ -2217,6 +2337,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
dst_oloc.pool = dst_ci->i_layout.pool_id;
dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+ osdc = &fsc->client->osdc;
while (len >= object_size) {
ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
@@ -2232,17 +2353,22 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
ceph_oid_printf(&dst_oid, "%llx.%08llx",
dst_ci->i_vino.ino, dst_objnum);
/* Do an object remote copy */
- ret = ceph_osdc_copy_from(&fsc->client->osdc,
- src_ci->i_vino.snap, 0,
- &src_oid, &src_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
- &dst_oid, &dst_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
- dst_ci->i_truncate_seq,
- dst_ci->i_truncate_size,
- CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+ req = ceph_alloc_copyfrom_request(osdc, src_ci->i_vino.snap,
+ &src_oid, &src_oloc,
+ &dst_oid, &dst_oloc,
+ dst_ci->i_truncate_seq,
+ dst_ci->i_truncate_size);
+ if (IS_ERR(req))
+ ret = PTR_ERR(req);
+ else {
+ ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_wait_request(osdc, req);
+ ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
+ req->r_start_latency,
+ req->r_end_latency,
+ object_size, ret);
+ ceph_osdc_put_request(req);
+ }
if (ret) {
if (ret == -EOPNOTSUPP) {
fsc->have_copy_from2 = false;
@@ -2358,6 +2484,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
goto out_caps;
/* Drop dst file cached pages */
+ ceph_fscache_invalidate(dst_inode, false);
ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
dst_off >> PAGE_SHIFT,
(dst_off + len) >> PAGE_SHIFT);
@@ -2431,7 +2558,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
- dst_ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
spin_unlock(&dst_ci->i_ceph_lock);
if (dirty)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1c7574105478..63113e2a4890 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent)
if (!S_ISDIR(parent->i_mode)) {
pr_warn_once("bad snapdir parent type (mode=0%o)\n",
parent->i_mode);
- return ERR_PTR(-ENOTDIR);
+ goto err;
}
if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
pr_warn_once("bad snapdir inode type (mode=0%o)\n",
inode->i_mode);
- return ERR_PTR(-ENOTDIR);
+ goto err;
}
inode->i_mode = parent->i_mode;
@@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent)
}
return inode;
+err:
+ if ((inode->i_state & I_NEW))
+ discard_new_inode(inode);
+ else
+ iput(inode);
+ return ERR_PTR(-ENOTDIR);
}
const struct inode_operations ceph_file_iops = {
@@ -447,12 +453,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
struct ceph_inode_info *ci;
int i;
- ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;
dout("alloc_inode %p\n", &ci->vfs_inode);
+ /* Set parameters for the netfs library */
+ netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops);
+
spin_lock_init(&ci->i_ceph_lock);
ci->i_version = 0;
@@ -538,9 +547,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
-
- ceph_fscache_inode_init(ci);
-
return &ci->vfs_inode;
}
@@ -564,6 +570,8 @@ void ceph_evict_inode(struct inode *inode)
percpu_counter_dec(&mdsc->metric.total_inodes);
truncate_inode_pages_final(&inode->i_data);
+ if (inode->i_state & I_PINNING_FSCACHE_WB)
+ ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
ceph_fscache_unregister_inode_cookie(ci);
@@ -634,6 +642,12 @@ int ceph_fill_file_size(struct inode *inode, int issued,
}
i_size_write(inode, size);
inode->i_blocks = calc_inode_blocks(size);
+ /*
+ * If we're expanding, then we should be able to just update
+ * the existing cookie.
+ */
+ if (size > isize)
+ ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
dout("truncate_seq %u -> %u\n",
@@ -666,10 +680,6 @@ int ceph_fill_file_size(struct inode *inode, int issued,
truncate_size);
ci->i_truncate_size = truncate_size;
}
-
- if (queue_trunc)
- ceph_fscache_invalidate(inode);
-
return queue_trunc;
}
@@ -1053,6 +1063,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_register_inode_cookie(inode);
+
if (fill_inline)
ceph_fill_inline_data(inode, locked_page,
iinfo->inline_data, iinfo->inline_len);
@@ -1195,7 +1207,7 @@ out_unlock:
/*
* splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
+ * caller must hold directory i_rwsem for this to be safe.
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
@@ -1592,7 +1604,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
return idx == 0 ? -ENOMEM : 0;
}
/* reading/filling the cache are serialized by
- * i_mutex, no need to use page lock */
+ * i_rwsem, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
if (idx == 0)
@@ -1814,11 +1826,13 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
i_size_write(inode, size);
+ ceph_fscache_update(inode);
inode->i_blocks = calc_inode_blocks(size);
ret = __ceph_should_report_size(ci);
spin_unlock(&ci->i_ceph_lock);
+
return ret;
}
@@ -1841,15 +1855,16 @@ void ceph_queue_inode_work(struct inode *inode, int work_bit)
static void ceph_do_invalidate_pages(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
u32 orig_gen;
int check = 0;
+ ceph_fscache_invalidate(inode, false);
+
mutex_lock(&ci->i_truncate_mutex);
- if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
- pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
- inode, ceph_ino(inode));
+ if (ceph_inode_is_shutdown(inode)) {
+ pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n",
+ __func__, ceph_vinop(inode));
mapping_set_error(inode->i_mapping, -EIO);
truncate_pagecache(inode, 0);
mutex_unlock(&ci->i_truncate_mutex);
@@ -1869,9 +1884,10 @@ static void ceph_do_invalidate_pages(struct inode *inode)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- ceph_fscache_invalidate(inode);
+ ceph_fscache_invalidate(inode, false);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
- pr_err("invalidate_pages %p fails\n", inode);
+ pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
+ ceph_vinop(inode));
}
spin_lock(&ci->i_ceph_lock);
@@ -1937,6 +1953,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_resize(inode, to);
truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
@@ -2103,12 +2120,14 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
loff_t isize = i_size_read(inode);
dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
- if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) {
- i_size_write(inode, attr->ia_size);
- inode->i_blocks = calc_inode_blocks(attr->ia_size);
- ci->i_reported_size = attr->ia_size;
- dirtied |= CEPH_CAP_FILE_EXCL;
- ia_valid |= ATTR_MTIME;
+ if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+ if (attr->ia_size > isize) {
+ i_size_write(inode, attr->ia_size);
+ inode->i_blocks = calc_inode_blocks(attr->ia_size);
+ ci->i_reported_size = attr->ia_size;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ ia_valid |= ATTR_MTIME;
+ }
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
attr->ia_size != isize) {
req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
@@ -2182,7 +2201,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
-
if (mask) {
req->r_inode = inode;
ihold(inode);
@@ -2217,6 +2235,9 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
err = setattr_prepare(&init_user_ns, dentry, attr);
if (err != 0)
return err;
@@ -2286,6 +2307,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
return err;
}
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int mode = USE_AUTH_MDS;
+ int err;
+ char *xattr_value;
+ size_t xattr_value_len;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
+ if (IS_ERR(req)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto put;
+ }
+
+ ihold(inode);
+ req->r_inode = inode;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto put;
+
+ xattr_value = req->r_reply_info.xattr_info.xattr_value;
+ xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
+
+ dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+
+ err = (int)xattr_value_len;
+ if (size == 0)
+ goto put;
+
+ if (xattr_value_len > size) {
+ err = -ERANGE;
+ goto put;
+ }
+
+ memcpy(value, xattr_value, xattr_value_len);
+put:
+ ceph_mdsc_put_request(req);
+out:
+ dout("do_getvxattr result=%d\n", err);
+ return err;
+}
+
/*
* Check inode permissions. We verify we have a valid value for
@@ -2347,6 +2419,9 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
u32 valid_mask = STATX_BASIC_STATS;
int err = 0;
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
/* Skip the getattr altogether if we're asked not to sync */
if (!(flags & AT_STATX_DONT_SYNC)) {
err = ceph_do_getattr(inode,
@@ -2394,3 +2469,27 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->result_mask = request_mask & valid_mask;
return err;
}
+
+void ceph_inode_shutdown(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct rb_node *p;
+ int iputs = 0;
+ bool invalidate = false;
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+
+ p = rb_next(p);
+ iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (invalidate)
+ ceph_queue_invalidate(inode);
+ while (iputs--)
+ iput(inode);
+}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index bdeb271f47d9..3e2843e86e27 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
- if (wait)
- req->r_wait_for_completion = ceph_lock_wait_for_completion;
-
- err = ceph_mdsc_do_request(mdsc, inode, req);
+ err = ceph_mdsc_submit_request(mdsc, inode, req);
+ if (!err)
+ err = ceph_mdsc_wait_request(mdsc, req, wait ?
+ ceph_lock_wait_for_completion : NULL);
if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
@@ -241,6 +241,9 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
@@ -302,9 +305,9 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /* No mandatory locks */
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
+
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
dout("ceph_flock, fl_file: %p\n", fl->fl_file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d64413adc0fd..00c3de177dd6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -555,6 +555,28 @@ bad:
return -EIO;
}
+static int parse_reply_info_getvxattr(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ u32 value_len;
+
+ ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
+ ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
+ ceph_decode_skip_32(p, end, bad); /* skip payload length */
+
+ ceph_decode_32_safe(p, end, value_len, bad);
+
+ if (value_len == end - *p) {
+ info->xattr_info.xattr_value = *p;
+ info->xattr_info.xattr_value_len = value_len;
+ *p = end;
+ return value_len;
+ }
+bad:
+ return -EIO;
+}
+
/*
* parse extra results
*/
@@ -570,6 +592,8 @@ static int parse_reply_info_extra(void **p, void *end,
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features, s);
+ else if (op == CEPH_MDS_OP_GETVXATTR)
+ return parse_reply_info_getvxattr(p, end, info, features);
else
return -EIO;
}
@@ -1590,129 +1614,23 @@ out:
return ret;
}
-static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap_snap *capsnap;
- int capsnap_release = 0;
-
- lockdep_assert_held(&ci->i_ceph_lock);
-
- dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
-
- while (!list_empty(&ci->i_cap_snaps)) {
- capsnap = list_first_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap, ci_item);
- __ceph_remove_capsnap(inode, capsnap, NULL, NULL);
- ceph_put_snap_context(capsnap->context);
- ceph_put_cap_snap(capsnap);
- capsnap_release++;
- }
- wake_up_all(&ci->i_cap_wq);
- wake_up_all(&mdsc->cap_flushing_wq);
- return capsnap_release;
-}
-
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
- struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
- struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- LIST_HEAD(to_remove);
- bool dirty_dropped = false;
bool invalidate = false;
- int capsnap_release = 0;
+ int iputs;
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
- __ceph_remove_cap(cap, false);
- if (!ci->i_auth_cap) {
- struct ceph_cap_flush *cf;
-
- if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
- if (inode->i_data.nrpages > 0)
- invalidate = true;
- if (ci->i_wrbuffer_ref > 0)
- mapping_set_error(&inode->i_data, -EIO);
- }
-
- while (!list_empty(&ci->i_cap_flush_list)) {
- cf = list_first_entry(&ci->i_cap_flush_list,
- struct ceph_cap_flush, i_list);
- list_move(&cf->i_list, &to_remove);
- }
-
- spin_lock(&mdsc->cap_dirty_lock);
-
- list_for_each_entry(cf, &to_remove, i_list)
- list_del_init(&cf->g_list);
-
- if (!list_empty(&ci->i_dirty_item)) {
- pr_warn_ratelimited(
- " dropping dirty %s state for %p %lld\n",
- ceph_cap_string(ci->i_dirty_caps),
- inode, ceph_ino(inode));
- ci->i_dirty_caps = 0;
- list_del_init(&ci->i_dirty_item);
- dirty_dropped = true;
- }
- if (!list_empty(&ci->i_flushing_item)) {
- pr_warn_ratelimited(
- " dropping dirty+flushing %s state for %p %lld\n",
- ceph_cap_string(ci->i_flushing_caps),
- inode, ceph_ino(inode));
- ci->i_flushing_caps = 0;
- list_del_init(&ci->i_flushing_item);
- mdsc->num_cap_flushing--;
- dirty_dropped = true;
- }
- spin_unlock(&mdsc->cap_dirty_lock);
-
- if (dirty_dropped) {
- mapping_set_error(inode->i_mapping, -EIO);
-
- if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_wr_ref == 0 &&
- ci->i_dirty_caps == 0 &&
- ci->i_flushing_caps == 0) {
- ceph_put_snap_context(ci->i_head_snapc);
- ci->i_head_snapc = NULL;
- }
- }
-
- if (atomic_read(&ci->i_filelock_ref) > 0) {
- /* make further file lock syscall return -EIO */
- ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
- pr_warn_ratelimited(" dropping file locks for %p %lld\n",
- inode, ceph_ino(inode));
- }
-
- if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
- list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
- ci->i_prealloc_cap_flush = NULL;
- }
-
- if (!list_empty(&ci->i_cap_snaps))
- capsnap_release = remove_capsnaps(mdsc, inode);
- }
+ iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
spin_unlock(&ci->i_ceph_lock);
- while (!list_empty(&to_remove)) {
- struct ceph_cap_flush *cf;
- cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, i_list);
- list_del_init(&cf->i_list);
- if (!cf->is_capsnap)
- ceph_free_cap_flush(cf);
- }
wake_up_all(&ci->i_cap_wq);
if (invalidate)
ceph_queue_invalidate(inode);
- if (dirty_dropped)
- iput(inode);
- while (capsnap_release--)
+ while (iputs--)
iput(inode);
return 0;
}
@@ -2284,7 +2202,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries);
while (order >= 0) {
rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
- __GFP_NOWARN,
+ __GFP_NOWARN |
+ __GFP_ZERO,
order);
if (rinfo->dir_entries)
break;
@@ -3052,15 +2971,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
return err;
}
-static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func)
{
int err;
/* wait */
dout("do_request waiting\n");
- if (!req->r_timeout && req->r_wait_for_completion) {
- err = req->r_wait_for_completion(mdsc, req);
+ if (wait_func) {
+ err = wait_func(mdsc, req);
} else {
long timeleft = wait_for_completion_killable_timeout(
&req->r_completion,
@@ -3117,7 +3037,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
/* issue */
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err)
- err = ceph_mdsc_wait_request(mdsc, req);
+ err = ceph_mdsc_wait_request(mdsc, req, NULL);
dout("do_request %p done, result %d\n", req, err);
return err;
}
@@ -3203,35 +3123,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
result = le32_to_cpu(head->result);
- /*
- * Handle an ESTALE
- * if we're not talking to the authority, send to them
- * if the authority has changed while we weren't looking,
- * send to new authority
- * Otherwise we just have to return an ESTALE
- */
- if (result == -ESTALE) {
- dout("got ESTALE on request %llu\n", req->r_tid);
- req->r_resend_mds = -1;
- if (req->r_direct_mode != USE_AUTH_MDS) {
- dout("not using auth, setting for that now\n");
- req->r_direct_mode = USE_AUTH_MDS;
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- } else {
- int mds = __choose_mds(mdsc, req, NULL);
- if (mds >= 0 && mds != req->r_session->s_mds) {
- dout("but auth changed, so resending\n");
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- }
- }
- dout("have to return ESTALE on request %llu\n", req->r_tid);
- }
-
-
if (head->safe) {
set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
@@ -3467,9 +3358,14 @@ static void handle_session(struct ceph_mds_session *session,
if (msg_version >= 3) {
u32 len;
- /* version >= 2, metadata */
- if (__decode_session_metadata(&p, end, &blocklisted) < 0)
+ /* version >= 2 and < 5, decode metadata, skip otherwise
+ * as it's handled via flags.
+ */
+ if (msg_version >= 5)
+ ceph_decode_skip_map(&p, end, string, string, bad);
+ else if (__decode_session_metadata(&p, end, &blocklisted) < 0)
goto bad;
+
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
if (len) {
@@ -3478,6 +3374,18 @@ static void handle_session(struct ceph_mds_session *session,
}
}
+ if (msg_version >= 5) {
+ u32 flags;
+ /* version >= 4, struct_v, struct_cv, len, metric_spec */
+ ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad);
+ /* version >= 5, flags */
+ ceph_decode_32_safe(&p, end, flags, bad);
+ if (flags & CEPH_SESSION_BLOCKLISTED) {
+ pr_warn("mds%d session blocklisted\n", session->s_mds);
+ blocklisted = true;
+ }
+ }
+
mutex_lock(&mdsc->mutex);
if (op == CEPH_SESSION_CLOSE) {
ceph_get_mds_session(session);
@@ -3772,7 +3680,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
char *path;
- int pathlen, err;
+ int pathlen = 0, err;
u64 pathbase;
u64 snap_follows;
@@ -3792,7 +3700,6 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
} else {
path = NULL;
- pathlen = 0;
pathbase = 0;
}
@@ -4527,8 +4434,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
bool check_session_state(struct ceph_mds_session *s)
{
- struct ceph_fs_client *fsc = s->s_mdsc->fsc;
-
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
@@ -4537,10 +4442,6 @@ bool check_session_state(struct ceph_mds_session *s)
}
break;
case CEPH_MDS_SESSION_CLOSING:
- /* Should never reach this when not force unmounting */
- WARN_ON_ONCE(s->s_ttl &&
- READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
- fallthrough;
case CEPH_MDS_SESSION_NEW:
case CEPH_MDS_SESSION_RESTARTING:
case CEPH_MDS_SESSION_CLOSED:
@@ -4931,7 +4832,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
ceph_cleanup_snapid_map(mdsc);
- ceph_cleanup_empty_realms(mdsc);
+ ceph_cleanup_global_and_empty_realms(mdsc);
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -5072,7 +4973,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
bad:
- pr_err("error decoding fsmap\n");
+ pr_err("error decoding fsmap %d. Shutting down mount.\n", err);
+ ceph_umount_begin(mdsc->fsc->sb);
err_out:
mutex_lock(&mdsc->mutex);
mdsc->mdsmap_err = err;
@@ -5139,7 +5041,8 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
bad_unlock:
mutex_unlock(&mdsc->mutex);
bad:
- pr_err("error decoding mdsmap %d\n", err);
+ pr_err("error decoding mdsmap %d. Shutting down mount.\n", err);
+ ceph_umount_begin(mdsc->fsc->sb);
return;
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 97c7f7bfa55f..33497846e47e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -100,6 +100,11 @@ struct ceph_mds_reply_dir_entry {
loff_t offset;
};
+struct ceph_mds_reply_xattr {
+ char *xattr_value;
+ size_t xattr_value_len;
+};
+
/*
* parsed info about an mds reply, including information about
* either: 1) the target inode and/or its parent directory and dentry,
@@ -115,6 +120,7 @@ struct ceph_mds_reply_info_parsed {
char *dname;
u32 dname_len;
struct ceph_mds_reply_lease *dlease;
+ struct ceph_mds_reply_xattr xattr_info;
/* extra */
union {
@@ -274,8 +280,8 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
- const struct cred *r_cred;
int r_request_release_offset;
+ const struct cred *r_cred;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -296,12 +302,11 @@ struct ceph_mds_request {
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
-
+ u32 r_readdir_offset;
struct page *r_locked_page;
int r_dir_caps;
int r_num_caps;
- u32 r_readdir_offset;
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
@@ -329,7 +334,6 @@ struct ceph_mds_request {
struct completion r_completion;
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
- ceph_mds_request_wait_callback_t r_wait_for_completion;
struct list_head r_unsafe_item; /* per-session unsafe list item */
long long r_dir_release_cnt;
@@ -507,6 +511,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 61d67cbcb367..30387733765d 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -263,10 +263,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
goto nomem;
for (j = 0; j < num_export_targets; j++) {
target = ceph_decode_32(&pexport_targets);
- if (target >= m->possible_max_rank) {
- err = -EIO;
- goto corrupt;
- }
info->export_targets[j] = target;
}
} else {
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 04d5df29bbbf..c47347d2e84e 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -8,6 +8,12 @@
#include "metric.h"
#include "mds_client.h"
+static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
+{
+ struct timespec64 t = ktime_to_timespec64(val);
+ ceph_encode_timespec64(ts, &t);
+}
+
static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s)
{
@@ -26,7 +32,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
u64 nr_caps = atomic64_read(&m->total_caps);
u32 header_len = sizeof(struct ceph_metric_header);
struct ceph_msg *msg;
- struct timespec64 ts;
s64 sum;
s32 items = 0;
s32 len;
@@ -59,37 +64,40 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the read latency metric */
read = (struct ceph_metric_read_latency *)(cap + 1);
read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
- read->header.ver = 1;
+ read->header.ver = 2;
read->header.compat = 1;
read->header.data_len = cpu_to_le32(sizeof(*read) - header_len);
- sum = m->read_latency_sum;
- jiffies_to_timespec64(sum, &ts);
- read->sec = cpu_to_le32(ts.tv_sec);
- read->nsec = cpu_to_le32(ts.tv_nsec);
+ sum = m->metric[METRIC_READ].latency_sum;
+ ktime_to_ceph_timespec(&read->lat, sum);
+ ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg);
+ read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum);
+ read->count = cpu_to_le64(m->metric[METRIC_READ].total);
items++;
/* encode the write latency metric */
write = (struct ceph_metric_write_latency *)(read + 1);
write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
- write->header.ver = 1;
+ write->header.ver = 2;
write->header.compat = 1;
write->header.data_len = cpu_to_le32(sizeof(*write) - header_len);
- sum = m->write_latency_sum;
- jiffies_to_timespec64(sum, &ts);
- write->sec = cpu_to_le32(ts.tv_sec);
- write->nsec = cpu_to_le32(ts.tv_nsec);
+ sum = m->metric[METRIC_WRITE].latency_sum;
+ ktime_to_ceph_timespec(&write->lat, sum);
+ ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg);
+ write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum);
+ write->count = cpu_to_le64(m->metric[METRIC_WRITE].total);
items++;
/* encode the metadata latency metric */
meta = (struct ceph_metric_metadata_latency *)(write + 1);
meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
- meta->header.ver = 1;
+ meta->header.ver = 2;
meta->header.compat = 1;
meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len);
- sum = m->metadata_latency_sum;
- jiffies_to_timespec64(sum, &ts);
- meta->sec = cpu_to_le32(ts.tv_sec);
- meta->nsec = cpu_to_le32(ts.tv_nsec);
+ sum = m->metric[METRIC_METADATA].latency_sum;
+ ktime_to_ceph_timespec(&meta->lat, sum);
+ ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg);
+ meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum);
+ meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total);
items++;
/* encode the dentry lease metric */
@@ -141,8 +149,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
rsize->header.ver = 1;
rsize->header.compat = 1;
rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len);
- rsize->total_ops = cpu_to_le64(m->total_reads);
- rsize->total_size = cpu_to_le64(m->read_size_sum);
+ rsize->total_ops = cpu_to_le64(m->metric[METRIC_READ].total);
+ rsize->total_size = cpu_to_le64(m->metric[METRIC_READ].size_sum);
items++;
/* encode the write io size metric */
@@ -151,8 +159,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
wsize->header.ver = 1;
wsize->header.compat = 1;
wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len);
- wsize->total_ops = cpu_to_le64(m->total_writes);
- wsize->total_size = cpu_to_le64(m->write_size_sum);
+ wsize->total_ops = cpu_to_le64(m->metric[METRIC_WRITE].total);
+ wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum);
items++;
put_unaligned_le32(items, &head->num);
@@ -160,8 +168,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
msg->hdr.version = cpu_to_le16(1);
msg->hdr.compat_version = cpu_to_le16(1);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("client%llu send metrics to mds%d\n",
- ceph_client_gid(mdsc->fsc->client), s->s_mds);
ceph_con_send(&s->s_con, msg);
return true;
@@ -220,7 +226,8 @@ static void metric_delayed_work(struct work_struct *work)
int ceph_metric_init(struct ceph_client_metric *m)
{
- int ret;
+ struct ceph_metric *metric;
+ int ret, i;
if (!m)
return -EINVAL;
@@ -243,32 +250,19 @@ int ceph_metric_init(struct ceph_client_metric *m)
if (ret)
goto err_i_caps_mis;
- spin_lock_init(&m->read_metric_lock);
- m->read_latency_sq_sum = 0;
- m->read_latency_min = KTIME_MAX;
- m->read_latency_max = 0;
- m->total_reads = 0;
- m->read_latency_sum = 0;
- m->read_size_min = U64_MAX;
- m->read_size_max = 0;
- m->read_size_sum = 0;
-
- spin_lock_init(&m->write_metric_lock);
- m->write_latency_sq_sum = 0;
- m->write_latency_min = KTIME_MAX;
- m->write_latency_max = 0;
- m->total_writes = 0;
- m->write_latency_sum = 0;
- m->write_size_min = U64_MAX;
- m->write_size_max = 0;
- m->write_size_sum = 0;
-
- spin_lock_init(&m->metadata_metric_lock);
- m->metadata_latency_sq_sum = 0;
- m->metadata_latency_min = KTIME_MAX;
- m->metadata_latency_max = 0;
- m->total_metadatas = 0;
- m->metadata_latency_sum = 0;
+ for (i = 0; i < METRIC_MAX; i++) {
+ metric = &m->metric[i];
+ spin_lock_init(&metric->lock);
+ metric->size_sum = 0;
+ metric->size_min = U64_MAX;
+ metric->size_max = 0;
+ metric->total = 0;
+ metric->latency_sum = 0;
+ metric->latency_avg = 0;
+ metric->latency_sq_sum = 0;
+ metric->latency_min = KTIME_MAX;
+ metric->latency_max = 0;
+ }
atomic64_set(&m->opened_files, 0);
ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
@@ -322,25 +316,24 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
max = new; \
}
-static inline void __update_stdev(ktime_t total, ktime_t lsum,
- ktime_t *sq_sump, ktime_t lat)
+static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg,
+ ktime_t *sq_sump, ktime_t lat)
{
- ktime_t avg, sq;
-
- if (unlikely(total == 1))
- return;
-
- /* the sq is (lat - old_avg) * (lat - new_avg) */
- avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1));
- sq = lat - avg;
- avg = DIV64_U64_ROUND_CLOSEST(lsum, total);
- sq = sq * (lat - avg);
- *sq_sump += sq;
+ ktime_t avg;
+
+ if (unlikely(total == 1)) {
+ *lavg = lat;
+ } else {
+ /* the sq is (lat - old_avg) * (lat - new_avg) */
+ avg = *lavg + div64_s64(lat - *lavg, total);
+ *sq_sump += (lat - *lavg)*(lat - avg);
+ *lavg = avg;
+ }
}
-void ceph_update_read_metrics(struct ceph_client_metric *m,
- ktime_t r_start, ktime_t r_end,
- unsigned int size, int rc)
+void ceph_update_metrics(struct ceph_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
{
ktime_t lat = ktime_sub(r_end, r_start);
ktime_t total;
@@ -348,63 +341,13 @@ void ceph_update_read_metrics(struct ceph_client_metric *m,
if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT))
return;
- spin_lock(&m->read_metric_lock);
- total = ++m->total_reads;
- m->read_size_sum += size;
- m->read_latency_sum += lat;
- METRIC_UPDATE_MIN_MAX(m->read_size_min,
- m->read_size_max,
- size);
- METRIC_UPDATE_MIN_MAX(m->read_latency_min,
- m->read_latency_max,
- lat);
- __update_stdev(total, m->read_latency_sum,
- &m->read_latency_sq_sum, lat);
- spin_unlock(&m->read_metric_lock);
-}
-
-void ceph_update_write_metrics(struct ceph_client_metric *m,
- ktime_t r_start, ktime_t r_end,
- unsigned int size, int rc)
-{
- ktime_t lat = ktime_sub(r_end, r_start);
- ktime_t total;
-
- if (unlikely(rc && rc != -ETIMEDOUT))
- return;
-
- spin_lock(&m->write_metric_lock);
- total = ++m->total_writes;
- m->write_size_sum += size;
- m->write_latency_sum += lat;
- METRIC_UPDATE_MIN_MAX(m->write_size_min,
- m->write_size_max,
- size);
- METRIC_UPDATE_MIN_MAX(m->write_latency_min,
- m->write_latency_max,
- lat);
- __update_stdev(total, m->write_latency_sum,
- &m->write_latency_sq_sum, lat);
- spin_unlock(&m->write_metric_lock);
-}
-
-void ceph_update_metadata_metrics(struct ceph_client_metric *m,
- ktime_t r_start, ktime_t r_end,
- int rc)
-{
- ktime_t lat = ktime_sub(r_end, r_start);
- ktime_t total;
-
- if (unlikely(rc && rc != -ENOENT))
- return;
-
- spin_lock(&m->metadata_metric_lock);
- total = ++m->total_metadatas;
- m->metadata_latency_sum += lat;
- METRIC_UPDATE_MIN_MAX(m->metadata_latency_min,
- m->metadata_latency_max,
- lat);
- __update_stdev(total, m->metadata_latency_sum,
- &m->metadata_latency_sq_sum, lat);
- spin_unlock(&m->metadata_metric_lock);
+ spin_lock(&m->lock);
+ total = ++m->total;
+ m->size_sum += size;
+ METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size);
+ m->latency_sum += lat;
+ METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat);
+ __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum,
+ lat);
+ spin_unlock(&m->lock);
}
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index 0133955a3c6a..0d0c44bd3332 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -2,7 +2,7 @@
#ifndef _FS_CEPH_MDS_METRIC_H
#define _FS_CEPH_MDS_METRIC_H
-#include <linux/types.h>
+#include <linux/ceph/types.h>
#include <linux/percpu_counter.h>
#include <linux/ktime.h>
@@ -19,27 +19,39 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_OPENED_INODES,
CLIENT_METRIC_TYPE_READ_IO_SIZES,
CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
-
- CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
};
/*
* This will always have the highest metric bit value
* as the last element of the array.
*/
-#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
- CLIENT_METRIC_TYPE_CAP_INFO, \
- CLIENT_METRIC_TYPE_READ_LATENCY, \
- CLIENT_METRIC_TYPE_WRITE_LATENCY, \
- CLIENT_METRIC_TYPE_METADATA_LATENCY, \
- CLIENT_METRIC_TYPE_DENTRY_LEASE, \
- CLIENT_METRIC_TYPE_OPENED_FILES, \
- CLIENT_METRIC_TYPE_PINNED_ICAPS, \
- CLIENT_METRIC_TYPE_OPENED_INODES, \
- CLIENT_METRIC_TYPE_READ_IO_SIZES, \
- CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
- \
- CLIENT_METRIC_TYPE_MAX, \
+#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
+ CLIENT_METRIC_TYPE_CAP_INFO, \
+ CLIENT_METRIC_TYPE_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_DENTRY_LEASE, \
+ CLIENT_METRIC_TYPE_OPENED_FILES, \
+ CLIENT_METRIC_TYPE_PINNED_ICAPS, \
+ CLIENT_METRIC_TYPE_OPENED_INODES, \
+ CLIENT_METRIC_TYPE_READ_IO_SIZES, \
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
+ \
+ CLIENT_METRIC_TYPE_MAX, \
}
struct ceph_metric_header {
@@ -60,22 +72,28 @@ struct ceph_metric_cap {
/* metric read latency header */
struct ceph_metric_read_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric write latency header */
struct ceph_metric_write_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric metadata latency header */
struct ceph_metric_metadata_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric dentry lease header */
@@ -125,6 +143,27 @@ struct ceph_metric_head {
__le32 num; /* the number of metrics that will be sent */
} __packed;
+enum metric_type {
+ METRIC_READ,
+ METRIC_WRITE,
+ METRIC_METADATA,
+ METRIC_COPYFROM,
+ METRIC_MAX
+};
+
+struct ceph_metric {
+ spinlock_t lock;
+ u64 total;
+ u64 size_sum;
+ u64 size_min;
+ u64 size_max;
+ ktime_t latency_sum;
+ ktime_t latency_avg;
+ ktime_t latency_sq_sum;
+ ktime_t latency_min;
+ ktime_t latency_max;
+};
+
/* This is the global metrics */
struct ceph_client_metric {
atomic64_t total_dentries;
@@ -135,32 +174,7 @@ struct ceph_client_metric {
struct percpu_counter i_caps_hit;
struct percpu_counter i_caps_mis;
- spinlock_t read_metric_lock;
- u64 total_reads;
- u64 read_size_sum;
- u64 read_size_min;
- u64 read_size_max;
- ktime_t read_latency_sum;
- ktime_t read_latency_sq_sum;
- ktime_t read_latency_min;
- ktime_t read_latency_max;
-
- spinlock_t write_metric_lock;
- u64 total_writes;
- u64 write_size_sum;
- u64 write_size_min;
- u64 write_size_max;
- ktime_t write_latency_sum;
- ktime_t write_latency_sq_sum;
- ktime_t write_latency_min;
- ktime_t write_latency_max;
-
- spinlock_t metadata_metric_lock;
- u64 total_metadatas;
- ktime_t metadata_latency_sum;
- ktime_t metadata_latency_sq_sum;
- ktime_t metadata_latency_min;
- ktime_t metadata_latency_max;
+ struct ceph_metric metric[METRIC_MAX];
/* The total number of directories and files that are opened */
atomic64_t opened_files;
@@ -195,13 +209,36 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m)
percpu_counter_inc(&m->i_caps_mis);
}
-extern void ceph_update_read_metrics(struct ceph_client_metric *m,
- ktime_t r_start, ktime_t r_end,
- unsigned int size, int rc);
-extern void ceph_update_write_metrics(struct ceph_client_metric *m,
- ktime_t r_start, ktime_t r_end,
- unsigned int size, int rc);
-extern void ceph_update_metadata_metrics(struct ceph_client_metric *m,
- ktime_t r_start, ktime_t r_end,
- int rc);
+extern void ceph_update_metrics(struct ceph_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc);
+
+static inline void ceph_update_read_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_READ],
+ r_start, r_end, size, rc);
+}
+static inline void ceph_update_write_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_WRITE],
+ r_start, r_end, size, rc);
+}
+static inline void ceph_update_metadata_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_METADATA],
+ r_start, r_end, 0, rc);
+}
+static inline void ceph_update_copyfrom_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_COPYFROM],
+ r_start, r_end, size, rc);
+}
#endif /* _FS_CEPH_MDS_METRIC_H */
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 620c691af40e..a338a3ec0dc4 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -30,6 +30,9 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
/* if root is the real CephFS root, we don't have quota realms */
if (root && ceph_ino(root) == CEPH_INO_ROOT)
return false;
+ /* MDS stray dirs have no quota realms */
+ if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino))
+ return false;
/* otherwise, we can't know for sure */
return true;
}
@@ -494,10 +497,24 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
if (ci->i_max_bytes) {
total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT;
used = ci->i_rbytes >> CEPH_BLOCK_SHIFT;
+ /* For quota size less than 4MB, use 4KB block size */
+ if (!total) {
+ total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT;
+ used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT;
+ buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT;
+ }
/* It is possible for a quota to be exceeded.
* Report 'zero' in that case
*/
free = total > used ? total - used : 0;
+ /* For quota size less than 4KB, report the
+ * total=used=4KB,free=0 when quota is full
+ * and total=free=4KB, used=0 otherwise */
+ if (!total) {
+ total = 1;
+ free = ci->i_max_bytes > ci->i_rbytes ? 1 : 0;
+ buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT;
+ }
}
spin_unlock(&ci->i_ceph_lock);
if (total) {
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index b41e6724c591..322ee5add942 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -121,18 +121,23 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
if (!realm)
return ERR_PTR(-ENOMEM);
- atomic_set(&realm->nref, 1); /* for caller */
+ /* Do not release the global dummy snaprealm until unmounting */
+ if (ino == CEPH_INO_GLOBAL_SNAPREALM)
+ atomic_set(&realm->nref, 2);
+ else
+ atomic_set(&realm->nref, 1);
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
INIT_LIST_HEAD(&realm->empty_item);
INIT_LIST_HEAD(&realm->dirty_item);
+ INIT_LIST_HEAD(&realm->rebuild_item);
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
mdsc->num_snap_realms++;
- dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ dout("%s %llx %p\n", __func__, realm->ino, realm);
return realm;
}
@@ -156,7 +161,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
else if (ino > r->ino)
n = n->rb_right;
else {
- dout("lookup_snap_realm %llx %p\n", r->ino, r);
+ dout("%s %llx %p\n", __func__, r->ino, r);
return r;
}
}
@@ -184,7 +189,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
{
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+ dout("%s %p %llx\n", __func__, realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
mdsc->num_snap_realms--;
@@ -260,9 +265,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_empty_lock);
}
-void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc)
{
+ struct ceph_snap_realm *global_realm;
+
down_write(&mdsc->snap_rwsem);
+ global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM);
+ if (global_realm)
+ ceph_put_snap_realm(mdsc, global_realm);
__cleanup_empty_realms(mdsc);
up_write(&mdsc->snap_rwsem);
}
@@ -292,9 +302,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
if (IS_ERR(parent))
return PTR_ERR(parent);
}
- dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
- realm->ino, realm, realm->parent_ino, realm->parent,
- parentino, parent);
+ dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino,
+ realm, realm->parent_ino, realm->parent, parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
@@ -320,7 +329,8 @@ static int cmpu64_rev(const void *a, const void *b)
* build the snap context for a given realm.
*/
static int build_snap_context(struct ceph_snap_realm *realm,
- struct list_head* dirty_realms)
+ struct list_head *realm_queue,
+ struct list_head *dirty_realms)
{
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
@@ -334,9 +344,9 @@ static int build_snap_context(struct ceph_snap_realm *realm,
*/
if (parent) {
if (!parent->cached_context) {
- err = build_snap_context(parent, dirty_realms);
- if (err)
- goto fail;
+ /* add to the queue head */
+ list_add(&parent->rebuild_item, realm_queue);
+ return 1;
}
num += parent->cached_context->num_snaps;
}
@@ -349,9 +359,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
- " (unchanged)\n",
- realm->ino, realm, realm->cached_context,
+ dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n",
+ __func__, realm->ino, realm, realm->cached_context,
realm->cached_context->seq,
(unsigned int)realm->cached_context->num_snaps);
return 0;
@@ -390,9 +399,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
- realm->ino, realm, snapc, snapc->seq,
- (unsigned int) snapc->num_snaps);
+ dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino,
+ realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps);
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
@@ -409,8 +417,7 @@ fail:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
- pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
- realm, err);
+ pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err);
return err;
}
@@ -420,13 +427,50 @@ fail:
static void rebuild_snap_realms(struct ceph_snap_realm *realm,
struct list_head *dirty_realms)
{
- struct ceph_snap_realm *child;
+ LIST_HEAD(realm_queue);
+ int last = 0;
+ bool skip = false;
+
+ list_add_tail(&realm->rebuild_item, &realm_queue);
+
+ while (!list_empty(&realm_queue)) {
+ struct ceph_snap_realm *_realm, *child;
+
+ _realm = list_first_entry(&realm_queue,
+ struct ceph_snap_realm,
+ rebuild_item);
+
+ /*
+ * If the last build failed due to a memory
+ * issue, just empty the realm_queue and return
+ * to avoid an infinite loop.
+ */
+ if (last < 0) {
+ list_del_init(&_realm->rebuild_item);
+ continue;
+ }
+
+ last = build_snap_context(_realm, &realm_queue, dirty_realms);
+ dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm,
+ last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
+
+ /* is any child in the list ? */
+ list_for_each_entry(child, &_realm->children, child_item) {
+ if (!list_empty(&child->rebuild_item)) {
+ skip = true;
+ break;
+ }
+ }
- dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
- build_snap_context(realm, dirty_realms);
+ if (!skip) {
+ list_for_each_entry(child, &_realm->children, child_item)
+ list_add_tail(&child->rebuild_item, &realm_queue);
+ }
- list_for_each_entry(child, &realm->children, child_item)
- rebuild_snap_realms(child, dirty_realms);
+ /* last == 1 means need to build parent first */
+ if (last <= 0)
+ list_del_init(&_realm->rebuild_item);
+ }
}
@@ -474,23 +518,15 @@ static bool has_new_snaps(struct ceph_snap_context *o,
* Caller must hold snap_rwsem for read (i.e., the realm topology won't
* change).
*/
-static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap **pcapsnap)
{
struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap;
struct ceph_snap_context *old_snapc, *new_snapc;
+ struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
int used, dirty;
- capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
- if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
- return;
- }
- capsnap->cap_flush.is_capsnap = true;
- INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
- INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
-
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
@@ -511,12 +547,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
- dout("queue_cap_snap %p already pending\n", inode);
+ dout("%s %p %llx.%llx already pending\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
- dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ dout("%s %p %llx.%llx nothing dirty|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
@@ -536,20 +574,17 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
- dout("queue_cap_snap %p "
- "no new_snap|dirty_page|writing\n", inode);
+ dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
}
- dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
- inode, capsnap, old_snapc, ceph_cap_string(dirty),
- capsnap->need_flush ? "" : "no_flush");
+ dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n",
+ __func__, inode, ceph_vinop(inode), capsnap, old_snapc,
+ ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
ihold(inode);
- refcount_set(&capsnap->nref, 1);
- INIT_LIST_HEAD(&capsnap->ci_item);
-
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty;
@@ -579,31 +614,30 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
- dout("queue_cap_snap %p cap_snap %p snapc %p"
- " seq %llu used WR, now pending\n", inode,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
+ " now pending\n", __func__, inode, ceph_vinop(inode),
capsnap, old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap);
}
- capsnap = NULL;
+ *pcapsnap = NULL;
old_snapc = NULL;
update_snapc:
- if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_wr_ref == 0 &&
- ci->i_dirty_caps == 0 &&
- ci->i_flushing_caps == 0) {
- ci->i_head_snapc = NULL;
- } else {
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ci->i_head_snapc = NULL;
+ } else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock);
ceph_buffer_put(old_blob);
- kfree(capsnap);
ceph_put_snap_context(old_snapc);
}
@@ -632,27 +666,28 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->truncate_size = ci->i_truncate_size;
capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size,
- capsnap->dirty_pages);
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "still has %d dirty pages\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size, capsnap->dirty_pages);
return 0;
}
/* Fb cap still in use, delay it */
if (ci->i_wb_ref) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "used WRBUFFER, delaying\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size);
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "used WRBUFFER, delaying\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
capsnap->writing = 1;
return 0;
}
ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
- inode, capsnap, capsnap->context,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
+ __func__, inode, ceph_vinop(inode), capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
capsnap->size);
@@ -671,8 +706,9 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
+ struct ceph_cap_snap *capsnap = NULL;
- dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+ dout("%s %p %llx inode\n", __func__, realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
@@ -682,13 +718,35 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
spin_unlock(&realm->inodes_with_caps_lock);
iput(lastinode);
lastinode = inode;
- ceph_queue_cap_snap(ci);
+
+ /*
+ * Allocate the capsnap memory outside of ceph_queue_cap_snap()
+ * to avoid the likely but unnecessary frequent memory
+ * allocation/free cycles in this loop.
+ */
+ if (!capsnap) {
+ capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS);
+ if (!capsnap) {
+ pr_err("ENOMEM allocating ceph_cap_snap on %p\n",
+ inode);
+ return;
+ }
+ }
+ capsnap->cap_flush.is_capsnap = true;
+ refcount_set(&capsnap->nref, 1);
+ INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
+ INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
+ INIT_LIST_HEAD(&capsnap->ci_item);
+
+ ceph_queue_cap_snap(ci, &capsnap);
spin_lock(&realm->inodes_with_caps_lock);
}
spin_unlock(&realm->inodes_with_caps_lock);
iput(lastinode);
- dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+ if (capsnap)
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
+ dout("%s %p %llx done\n", __func__, realm, realm->ino);
}
/*
@@ -707,14 +765,16 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
__le64 *prior_parent_snaps; /* encoded */
struct ceph_snap_realm *realm = NULL;
struct ceph_snap_realm *first_realm = NULL;
- int invalidate = 0;
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
+ int rebuild_snapcs;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("update_snap_trace deletion=%d\n", deletion);
+ dout("%s deletion=%d\n", __func__, deletion);
more:
+ rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
@@ -738,10 +798,10 @@ more:
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
- invalidate += err;
+ rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+ dout("%s updating %llx %p %lld -> %lld\n", __func__,
realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
@@ -763,22 +823,30 @@ more:
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
- invalidate = 1;
+ rebuild_snapcs = 1;
} else if (!realm->cached_context) {
- dout("update_snap_trace %llx %p seq %lld new\n",
+ dout("%s %llx %p seq %lld new\n", __func__,
realm->ino, realm, realm->seq);
- invalidate = 1;
+ rebuild_snapcs = 1;
} else {
- dout("update_snap_trace %llx %p seq %lld unchanged\n",
+ dout("%s %llx %p seq %lld unchanged\n", __func__,
realm->ino, realm, realm->seq);
}
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
+ dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
+
+ /*
+ * this will always track the topmost parent realm from which
+ * we need to rebuild the snapshot contexts _downward_ in
+ * the hierarchy.
+ */
+ if (rebuild_snapcs)
+ realm_to_rebuild = realm;
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate && p >= e)
- rebuild_snap_realms(realm, &dirty_realms);
+ /* rebuild_snapcs when we reach the _end_ (root) of the trace */
+ if (realm_to_rebuild && p >= e)
+ rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -814,7 +882,7 @@ fail:
ceph_put_snap_realm(mdsc, realm);
if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
- pr_err("update_snap_trace error %d\n", err);
+ pr_err("%s error %d\n", __func__, err);
return err;
}
@@ -831,7 +899,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
struct inode *inode;
struct ceph_mds_session *session = NULL;
- dout("flush_snaps\n");
+ dout("%s\n", __func__);
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
@@ -846,7 +914,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_flush_lock);
ceph_put_mds_session(session);
- dout("flush_snaps done\n");
+ dout("%s done\n", __func__);
}
/**
@@ -928,8 +996,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
- dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
- ceph_snap_op_name(op), split, trace_len);
+ dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
+ mds, ceph_snap_op_name(op), split, trace_len);
mutex_lock(&session->s_mutex);
inc_session_sequence(session);
@@ -989,13 +1057,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
- dout(" leaving %p in newer realm %llx %p\n",
- inode, ci->i_snap_realm->ino,
+ dout(" leaving %p %llx.%llx in newer realm %llx %p\n",
+ inode, ceph_vinop(inode), ci->i_snap_realm->ino,
ci->i_snap_realm);
goto skip_inode;
}
- dout(" will move %p to split realm %llx %p\n",
- inode, realm->ino, realm);
+ dout(" will move %p %llx.%llx to split realm %llx %p\n",
+ inode, ceph_vinop(inode), realm->ino, realm);
ceph_get_snap_realm(mdsc, realm);
ceph_change_snap_realm(inode, realm);
@@ -1038,7 +1106,7 @@ skip_inode:
return;
bad:
- pr_err("corrupt snap message from mds%d\n", mds);
+ pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
@@ -1071,7 +1139,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
}
spin_unlock(&mdsc->snapid_map_lock);
if (exist) {
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
@@ -1115,11 +1184,13 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
if (exist) {
free_anon_bdev(sm->dev);
kfree(sm);
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
- dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ dout("%s create snapid map %llx -> %x\n", __func__,
+ sm->snap, sm->dev);
return sm;
}
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 573bb9556fb5..e36e8948e728 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_GETVXATTR: return "getvxattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fd8742bae847..e6987d295079 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -27,6 +27,8 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+#include <uapi/linux/magic.h>
+
static DEFINE_SPINLOCK(ceph_fsc_lock);
static LIST_HEAD(ceph_fsc_list);
@@ -52,8 +54,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
struct ceph_mon_client *monc = &fsc->client->monc;
struct ceph_statfs st;
- u64 fsid;
- int err;
+ int i, err;
u64 data_pool;
if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
@@ -99,12 +100,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = NAME_MAX;
/* Must convert the fsid, for consistent values across arches */
+ buf->f_fsid.val[0] = 0;
mutex_lock(&monc->mutex);
- fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^
- le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1));
+ for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i)
+ buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]);
mutex_unlock(&monc->mutex);
- buf->f_fsid = u64_to_fsid(fsid);
+ /* fold the fs_cluster_id into the upper bits */
+ buf->f_fsid.val[1] = monc->fs_cluster_id;
return 0;
}
@@ -145,6 +148,7 @@ enum {
Opt_mds_namespace,
Opt_recover_session,
Opt_source,
+ Opt_mon_addr,
/* string args above */
Opt_dirstat,
Opt_rbytes,
@@ -158,6 +162,7 @@ enum {
Opt_quotadf,
Opt_copyfrom,
Opt_wsync,
+ Opt_pagecache,
};
enum ceph_recover_session_mode {
@@ -196,8 +201,10 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
fsparam_u32 ("rsize", Opt_rsize),
fsparam_string ("snapdirname", Opt_snapdirname),
fsparam_string ("source", Opt_source),
+ fsparam_string ("mon_addr", Opt_mon_addr),
fsparam_u32 ("wsize", Opt_wsize),
fsparam_flag_no ("wsync", Opt_wsync),
+ fsparam_flag_no ("pagecache", Opt_pagecache),
{}
};
@@ -227,9 +234,92 @@ static void canonicalize_path(char *path)
}
/*
- * Parse the source parameter. Distinguish the server list from the path.
+ * Check if the mds namespace in ceph_mount_options matches
+ * the passed in namespace string. First time match (when
+ * ->mds_namespace is NULL) is treated specially, since
+ * ->mds_namespace needs to be initialized by the caller.
+ */
+static int namespace_equals(struct ceph_mount_options *fsopt,
+ const char *namespace, size_t len)
+{
+ return !(fsopt->mds_namespace &&
+ (strlen(fsopt->mds_namespace) != len ||
+ strncmp(fsopt->mds_namespace, namespace, len)));
+}
+
+static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end,
+ struct fs_context *fc)
+{
+ int r;
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+
+ if (*dev_name_end != ':')
+ return invalfc(fc, "separator ':' missing in source");
+
+ r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name,
+ pctx->copts, fc->log.log, ',');
+ if (r)
+ return r;
+
+ fsopt->new_dev_syntax = false;
+ return 0;
+}
+
+static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end,
+ struct fs_context *fc)
+{
+ size_t len;
+ struct ceph_fsid fsid;
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ char *fsid_start, *fs_name_start;
+
+ if (*dev_name_end != '=') {
+ dout("separator '=' missing in source");
+ return -EINVAL;
+ }
+
+ fsid_start = strchr(dev_name, '@');
+ if (!fsid_start)
+ return invalfc(fc, "missing cluster fsid");
+ ++fsid_start; /* start of cluster fsid */
+
+ fs_name_start = strchr(fsid_start, '.');
+ if (!fs_name_start)
+ return invalfc(fc, "missing file system name");
+
+ if (ceph_parse_fsid(fsid_start, &fsid))
+ return invalfc(fc, "Invalid FSID");
+
+ ++fs_name_start; /* start of file system name */
+ len = dev_name_end - fs_name_start;
+
+ if (!namespace_equals(fsopt, fs_name_start, len))
+ return invalfc(fc, "Mismatching mds_namespace");
+ kfree(fsopt->mds_namespace);
+ fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL);
+ if (!fsopt->mds_namespace)
+ return -ENOMEM;
+ dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace);
+
+ fsopt->new_dev_syntax = true;
+ return 0;
+}
+
+/*
+ * Parse the source parameter for new device format. Distinguish the device
+ * spec from the path. Try parsing the new device format and fall back to the
+ * old format if needed.
*
- * The source will look like:
+ * New device syntax looks like:
+ * <device_spec>=/<path>
+ * where
+ * <device_spec> is name@fsid.fsname
+ * <path> is optional, but if present must begin with '/'
+ * (monitor addresses are passed via mount option)
+ *
+ * Old device syntax is:
* <server_spec>[,<server_spec>...]:[<path>]
* where
* <server_spec> is <ip>[:<port>]
@@ -262,24 +352,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
dev_name_end = dev_name + strlen(dev_name);
}
- dev_name_end--; /* back up to ':' separator */
- if (dev_name_end < dev_name || *dev_name_end != ':')
- return invalfc(fc, "No path or : separator in source");
+ dev_name_end--; /* back up to separator */
+ if (dev_name_end < dev_name)
+ return invalfc(fc, "Path missing in source");
dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
if (fsopt->server_path)
dout("server path '%s'\n", fsopt->server_path);
- ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name,
- pctx->copts, fc->log.log);
- if (ret)
- return ret;
+ dout("trying new device syntax");
+ ret = ceph_parse_new_source(dev_name, dev_name_end, fc);
+ if (ret) {
+ if (ret != -EINVAL)
+ return ret;
+ dout("trying old device syntax");
+ ret = ceph_parse_old_source(dev_name, dev_name_end, fc);
+ if (ret)
+ return ret;
+ }
fc->source = param->string;
param->string = NULL;
return 0;
}
+static int ceph_parse_mon_addr(struct fs_parameter *param,
+ struct fs_context *fc)
+{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+
+ kfree(fsopt->mon_addr);
+ fsopt->mon_addr = param->string;
+ param->string = NULL;
+
+ return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr),
+ pctx->copts, fc->log.log, '/');
+}
+
static int ceph_parse_mount_param(struct fs_context *fc,
struct fs_parameter *param)
{
@@ -305,6 +415,8 @@ static int ceph_parse_mount_param(struct fs_context *fc,
param->string = NULL;
break;
case Opt_mds_namespace:
+ if (!namespace_equals(fsopt, param->string, strlen(param->string)))
+ return invalfc(fc, "Mismatching mds_namespace");
kfree(fsopt->mds_namespace);
fsopt->mds_namespace = param->string;
param->string = NULL;
@@ -322,6 +434,8 @@ static int ceph_parse_mount_param(struct fs_context *fc,
if (fc->source)
return invalfc(fc, "Multiple sources specified");
return ceph_parse_source(param, fc);
+ case Opt_mon_addr:
+ return ceph_parse_mon_addr(param, fc);
case Opt_wsize:
if (result.uint_32 < PAGE_SIZE ||
result.uint_32 > CEPH_MAX_WRITE_SIZE)
@@ -454,6 +568,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
else
fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
break;
+ case Opt_pagecache:
+ if (result.negated)
+ fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
+ else
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
+ break;
default:
BUG();
}
@@ -473,6 +593,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
kfree(args->mds_namespace);
kfree(args->server_path);
kfree(args->fscache_uniq);
+ kfree(args->mon_addr);
kfree(args);
}
@@ -516,6 +637,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
if (ret)
return ret;
+ ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr);
+ if (ret)
+ return ret;
+
return ceph_compare_options(new_opt, fsc->client);
}
@@ -571,14 +696,21 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
seq_puts(m, ",copyfrom");
- if (fsopt->mds_namespace)
+ /* dump mds_namespace when old device syntax is in use */
+ if (fsopt->mds_namespace && !fsopt->new_dev_syntax)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
+ if (fsopt->mon_addr)
+ seq_printf(m, ",mon_addr=%s", fsopt->mon_addr);
+
if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
seq_show_option(m, "recover_session", "clean");
- if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
- seq_puts(m, ",nowsync");
+ if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
+ seq_puts(m, ",wsync");
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+ seq_puts(m, ",nopagecache");
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%u", fsopt->wsize);
@@ -670,6 +802,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->have_copy_from2 = true;
atomic_long_set(&fsc->writeback_count, 0);
+ fsc->write_congested = false;
err = -ENOMEM;
/*
@@ -732,6 +865,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_snap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
@@ -760,6 +894,9 @@ static int __init init_caches(void)
ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
if (!ceph_cap_cachep)
goto bad_cap;
+ ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD);
+ if (!ceph_cap_snap_cachep)
+ goto bad_cap_snap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (!ceph_cap_flush_cachep)
@@ -786,16 +923,10 @@ static int __init init_caches(void)
if (!ceph_wb_pagevec_pool)
goto bad_pagevec_pool;
- error = ceph_fscache_register();
- if (error)
- goto bad_fscache;
-
return 0;
-bad_fscache:
- kmem_cache_destroy(ceph_mds_request_cachep);
bad_pagevec_pool:
- mempool_destroy(ceph_wb_pagevec_pool);
+ kmem_cache_destroy(ceph_mds_request_cachep);
bad_mds_req:
kmem_cache_destroy(ceph_dir_file_cachep);
bad_dir_file:
@@ -805,6 +936,8 @@ bad_file:
bad_dentry:
kmem_cache_destroy(ceph_cap_flush_cachep);
bad_cap_flush:
+ kmem_cache_destroy(ceph_cap_snap_cachep);
+bad_cap_snap:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
@@ -821,14 +954,13 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_cap_snap_cachep);
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
kmem_cache_destroy(ceph_dir_file_cachep);
kmem_cache_destroy(ceph_mds_request_cachep);
mempool_destroy(ceph_wb_pagevec_pool);
-
- ceph_fscache_unregister();
}
static void __ceph_umount_begin(struct ceph_fs_client *fsc)
@@ -842,7 +974,7 @@ static void __ceph_umount_begin(struct ceph_fs_client *fsc)
* ceph_umount_begin - initiate forced umount. Tear down the
* mount, skipping steps that may hang while waiting for server(s).
*/
-static void ceph_umount_begin(struct super_block *sb)
+void ceph_umount_begin(struct super_block *sb)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
@@ -1059,6 +1191,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
static int ceph_get_tree(struct fs_context *fc)
{
struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
struct super_block *sb;
struct ceph_fs_client *fsc;
struct dentry *res;
@@ -1070,6 +1203,8 @@ static int ceph_get_tree(struct fs_context *fc)
if (!fc->source)
return invalfc(fc, "No source");
+ if (fsopt->new_dev_syntax && !fsopt->mon_addr)
+ return invalfc(fc, "No monitor address");
/* create client (which we may/may not use) */
fsc = create_fs_client(pctx->opts, pctx->copts);
@@ -1155,6 +1290,13 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
else
ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
+ if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) {
+ kfree(fsc->mount_options->mon_addr);
+ fsc->mount_options->mon_addr = fsopt->mon_addr;
+ fsopt->mon_addr = NULL;
+ pr_notice("ceph: monitor addresses recorded, but not used for reconnection");
+ }
+
sync_filesystem(fc->root->d_sb);
return 0;
}
@@ -1332,6 +1474,14 @@ bool disable_send_metrics = false;
module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644);
MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
+/* for both v1 and v2 syntax */
+static bool mount_support = true;
+static const struct kernel_param_ops param_ops_mount_syntax = {
+ .get = param_get_bool,
+};
+module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444);
+module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444);
+
module_init(init_ceph);
module_exit(exit_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 14f951cd5b61..20ceab74e871 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -17,21 +17,16 @@
#include <linux/posix_acl.h>
#include <linux/refcount.h>
#include <linux/security.h>
-
-#include <linux/ceph/libceph.h>
-
-#ifdef CONFIG_CEPH_FSCACHE
-#define FSCACHE_USE_NEW_IO_API
+#include <linux/netfs.h>
#include <linux/fscache.h>
-#endif
-/* f_type in struct statfs */
-#define CEPH_SUPER_MAGIC 0x00c36400
+#include <linux/ceph/libceph.h>
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
+#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */
#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
@@ -45,10 +40,12 @@
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
+#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */
#define CEPH_MOUNT_OPT_DEFAULT \
(CEPH_MOUNT_OPT_DCACHE | \
- CEPH_MOUNT_OPT_NOCOPYFROM)
+ CEPH_MOUNT_OPT_NOCOPYFROM | \
+ CEPH_MOUNT_OPT_ASYNC_DIROPS)
#define ceph_set_mount_opt(fsc, opt) \
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt
@@ -88,6 +85,8 @@ struct ceph_mount_options {
unsigned int max_readdir; /* max readdir result (entries) */
unsigned int max_readdir_bytes; /* max readdir result (bytes) */
+ bool new_dev_syntax;
+
/*
* everything above this point can be memcmp'd; everything below
* is handled in compare_mount_options()
@@ -97,6 +96,7 @@ struct ceph_mount_options {
char *mds_namespace; /* default NULL */
char *server_path; /* default NULL (means "/") */
char *fscache_uniq; /* default NULL */
+ char *mon_addr;
};
struct ceph_fs_client {
@@ -119,6 +119,7 @@ struct ceph_fs_client {
struct ceph_mds_client *mdsc;
atomic_long_t writeback_count;
+ bool write_congested;
struct workqueue_struct *inode_wq;
struct workqueue_struct *cap_wq;
@@ -128,13 +129,13 @@ struct ceph_fs_client {
struct dentry *debugfs_congestion_kb;
struct dentry *debugfs_bdi;
struct dentry *debugfs_mdsc, *debugfs_mdsmap;
- struct dentry *debugfs_metric;
struct dentry *debugfs_status;
struct dentry *debugfs_mds_sessions;
+ struct dentry *debugfs_metrics_dir;
#endif
#ifdef CONFIG_CEPH_FSCACHE
- struct fscache_cookie *fscache;
+ struct fscache_volume *fscache;
#endif
};
@@ -228,7 +229,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
- kfree(capsnap);
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
}
}
@@ -315,6 +316,11 @@ struct ceph_inode_xattrs_info {
* Ceph inode.
*/
struct ceph_inode_info {
+ struct {
+ /* These must be contiguous */
+ struct inode vfs_inode;
+ struct netfs_i_context netfs_ctx; /* Netfslib context */
+ };
struct ceph_vino i_vino; /* ceph ino + snap */
spinlock_t i_ceph_lock;
@@ -425,11 +431,6 @@ struct ceph_inode_info {
struct work_struct i_work;
unsigned long i_work_mask;
-
-#ifdef CONFIG_CEPH_FSCACHE
- struct fscache_cookie *fscache;
-#endif
- struct inode vfs_inode; /* at end */
};
static inline struct ceph_inode_info *
@@ -534,19 +535,23 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
*
* These come from src/mds/mdstypes.h in the ceph sources.
*/
-#define CEPH_MAX_MDS 0x100
-#define CEPH_NUM_STRAY 10
+#define CEPH_MAX_MDS 0x100
+#define CEPH_NUM_STRAY 10
#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS)
+#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS)
#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY))
static inline bool ceph_vino_is_reserved(const struct ceph_vino vino)
{
- if (vino.ino < CEPH_INO_SYSTEM_BASE &&
- vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) {
- WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino);
- return true;
- }
- return false;
+ if (vino.ino >= CEPH_INO_SYSTEM_BASE ||
+ vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET)
+ return false;
+
+ /* Don't warn on mdsdirs */
+ WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET,
+ "Attempt to access reserved inode number 0x%llx",
+ vino.ino);
+ return true;
}
static inline struct inode *ceph_find_inode(struct super_block *sb,
@@ -580,6 +585,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */
#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
+#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */
/*
* Masks of ceph inode work.
@@ -876,6 +882,8 @@ struct ceph_snap_realm {
struct list_head dirty_item; /* if realm needs new context */
+ struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */
+
/* the current set of snaps for this realm */
struct ceph_snap_context *cached_context;
@@ -931,7 +939,7 @@ extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_msg *msg);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
-extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc);
extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
u64 snap);
@@ -939,6 +947,7 @@ extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
struct ceph_snapid_map *sm);
extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+void ceph_umount_begin(struct super_block *sb);
/*
@@ -1027,9 +1036,20 @@ extern int ceph_setattr(struct user_namespace *mnt_userns,
extern int ceph_getattr(struct user_namespace *mnt_userns,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
+void ceph_inode_shutdown(struct inode *inode);
+
+static inline bool ceph_inode_is_shutdown(struct inode *inode)
+{
+ unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ int state = READ_ONCE(fsc->mount_state);
+
+ return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN;
+}
/* xattr.c */
int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
@@ -1194,10 +1214,12 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
/* addr.c */
extern const struct address_space_operations ceph_aops;
+extern const struct netfs_request_ops ceph_netfs_ops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
-extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
+extern int ceph_uninline_data(struct file *file);
extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
+int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
/* file.c */
extern const struct file_operations ceph_file_fops;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 159a1ffa4f4b..afec84088471 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -923,10 +923,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_xattr *xattr;
- struct ceph_vxattr *vxattr = NULL;
+ struct ceph_vxattr *vxattr;
int req_mask;
ssize_t err;
+ if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto handle_non_vxattrs;
+
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr) {
@@ -945,8 +948,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
err = -ERANGE;
}
return err;
+ } else {
+ err = ceph_do_getvxattr(inode, name, value, size);
+ /* this would happen with a new client and old server combo */
+ if (err == -EOPNOTSUPP)
+ err = -ENODATA;
+ return err;
}
-
+handle_non_vxattrs:
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
@@ -1311,7 +1320,7 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
int err;
err = security_dentry_init_security(dentry, mode, &dentry->d_name,
- &as_ctx->sec_ctx,
+ &name, &as_ctx->sec_ctx,
&as_ctx->sec_ctxlen);
if (err < 0) {
WARN_ON_ONCE(err != -EOPNOTSUPP);
@@ -1335,7 +1344,6 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
* It only supports single security module and only selinux has
* dentry_init_security hook.
*/
- name = XATTR_NAME_SELINUX;
name_len = strlen(name);
err = ceph_pagelist_reserve(pagelist,
4 * 2 + name_len + as_ctx->sec_ctxlen);
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 87fcacdf3de7..cc8fdcb35b71 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -25,7 +25,7 @@ cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o
cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o
-cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
+cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o
cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
deleted file mode 100644
index 8be57aaedab6..000000000000
--- a/fs/cifs/cache.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: LGPL-2.1
-/*
- * CIFS filesystem cache index structure definitions
- *
- * Copyright (c) 2010 Novell, Inc.
- * Authors(s): Suresh Jayaraman (sjayaraman@suse.de>
- *
- */
-#include "fscache.h"
-#include "cifs_debug.h"
-
-/*
- * CIFS filesystem definition for FS-Cache
- */
-struct fscache_netfs cifs_fscache_netfs = {
- .name = "cifs",
- .version = 0,
-};
-
-/*
- * Register CIFS for caching with FS-Cache
- */
-int cifs_fscache_register(void)
-{
- return fscache_register_netfs(&cifs_fscache_netfs);
-}
-
-/*
- * Unregister CIFS for caching
- */
-void cifs_fscache_unregister(void)
-{
- fscache_unregister_netfs(&cifs_fscache_netfs);
-}
-
-/*
- * Server object for FS-Cache
- */
-const struct fscache_cookie_def cifs_fscache_server_index_def = {
- .name = "CIFS.server",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-static enum
-fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
- const void *data,
- uint16_t datalen,
- loff_t object_size)
-{
- struct cifs_fscache_super_auxdata auxdata;
- const struct cifs_tcon *tcon = cookie_netfs_data;
-
- if (datalen != sizeof(auxdata))
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.resource_id = tcon->resource_id;
- auxdata.vol_create_time = tcon->vol_create_time;
- auxdata.vol_serial_number = tcon->vol_serial_number;
-
- if (memcmp(data, &auxdata, datalen) != 0)
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- return FSCACHE_CHECKAUX_OKAY;
-}
-
-/*
- * Superblock object for FS-Cache
- */
-const struct fscache_cookie_def cifs_fscache_super_index_def = {
- .name = "CIFS.super",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
- .check_aux = cifs_fscache_super_check_aux,
-};
-
-static enum
-fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
- const void *data,
- uint16_t datalen,
- loff_t object_size)
-{
- struct cifs_fscache_inode_auxdata auxdata;
- struct cifsInodeInfo *cifsi = cookie_netfs_data;
-
- if (datalen != sizeof(auxdata))
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.eof = cifsi->server_eof;
- auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec;
- auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec;
- auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec;
- auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec;
-
- if (memcmp(data, &auxdata, datalen) != 0)
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- return FSCACHE_CHECKAUX_OKAY;
-}
-
-const struct fscache_cookie_def cifs_fscache_inode_object_def = {
- .name = "CIFS.uniqueid",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .check_aux = cifs_fscache_inode_check_aux,
-};
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index de2c12bcfa4b..9d334816eac0 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -94,7 +94,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
le32_to_cpu(tcon->fsAttrInfo.Attributes),
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
- tcon->tidStatus);
+ tcon->status);
if (dev_type == FILE_DEVICE_DISK)
seq_puts(m, " type: DISK ");
else if (dev_type == FILE_DEVICE_CD_ROM)
@@ -271,7 +271,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
c = 0;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- if (server->is_channel)
+ /* channel info will be printed as a part of sessions below */
+ if (CIFS_SERVER_IS_CHAN(server))
continue;
c++;
@@ -358,6 +359,8 @@ skip_rdma:
seq_printf(m, " signed");
if (server->posix_ext_supported)
seq_printf(m, " posix");
+ if (server->nosharesock)
+ seq_printf(m, " nosharesock");
if (server->rdma)
seq_printf(m, "\nRDMA ");
@@ -412,12 +415,20 @@ skip_rdma:
from_kuid(&init_user_ns, ses->linux_uid),
from_kuid(&init_user_ns, ses->cred_uid));
+ spin_lock(&ses->chan_lock);
+ if (CIFS_CHAN_NEEDS_RECONNECT(ses, 0))
+ seq_puts(m, "\tPrimary channel: DISCONNECTED ");
+
if (ses->chan_count > 1) {
seq_printf(m, "\n\n\tExtra Channels: %zu ",
ses->chan_count-1);
- for (j = 1; j < ses->chan_count; j++)
+ for (j = 1; j < ses->chan_count; j++) {
cifs_dump_channel(m, j, &ses->chans[j]);
+ if (CIFS_CHAN_NEEDS_RECONNECT(ses, j))
+ seq_puts(m, "\tDISCONNECTED ");
+ }
}
+ spin_unlock(&ses->chan_lock);
seq_puts(m, "\n\n\tShares: ");
j = 0;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 007427ba75e5..b0864da9ef43 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -307,12 +307,8 @@ static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt,
static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
{
struct cifs_sb_info *cifs_sb;
- struct cifs_ses *ses;
- struct cifs_tcon *tcon;
void *page;
- char *full_path, *root_path;
- unsigned int xid;
- int rc;
+ char *full_path;
struct vfsmount *mnt;
cifs_dbg(FYI, "in %s\n", __func__);
@@ -324,8 +320,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
* the double backslashes usually used in the UNC. This function
* gives us the latter, so we must adjust the result.
*/
- mnt = ERR_PTR(-ENOMEM);
-
cifs_sb = CIFS_SB(mntpt->d_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) {
mnt = ERR_PTR(-EREMOTE);
@@ -341,60 +335,11 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
}
convert_delimiter(full_path, '\\');
-
cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
- if (!cifs_sb_master_tlink(cifs_sb)) {
- cifs_dbg(FYI, "%s: master tlink is NULL\n", __func__);
- goto free_full_path;
- }
-
- tcon = cifs_sb_master_tcon(cifs_sb);
- if (!tcon) {
- cifs_dbg(FYI, "%s: master tcon is NULL\n", __func__);
- goto free_full_path;
- }
-
- root_path = kstrdup(tcon->treeName, GFP_KERNEL);
- if (!root_path) {
- mnt = ERR_PTR(-ENOMEM);
- goto free_full_path;
- }
- cifs_dbg(FYI, "%s: root path: %s\n", __func__, root_path);
-
- ses = tcon->ses;
- xid = get_xid();
-
- /*
- * If DFS root has been expired, then unconditionally fetch it again to
- * refresh DFS referral cache.
- */
- rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
- root_path + 1, NULL, NULL);
- if (!rc) {
- rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
- cifs_remap(cifs_sb), full_path + 1,
- NULL, NULL);
- }
-
- free_xid(xid);
-
- if (rc) {
- mnt = ERR_PTR(rc);
- goto free_root_path;
- }
- /*
- * OK - we were able to get and cache a referral for @full_path.
- *
- * Now, pass it down to cifs_mount() and it will retry every available
- * node server in case of failures - no need to do it here.
- */
mnt = cifs_dfs_do_mount(mntpt, cifs_sb, full_path);
- cifs_dbg(FYI, "%s: cifs_dfs_do_mount:%s , mnt:%p\n", __func__,
- full_path + 1, mnt);
+ cifs_dbg(FYI, "%s: cifs_dfs_do_mount:%s , mnt:%p\n", __func__, full_path + 1, mnt);
-free_root_path:
- kfree(root_path);
free_full_path:
free_dentry_path(page);
cdda_exit:
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index f97407520ea1..013a4bd65280 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -61,11 +61,6 @@ struct cifs_sb_info {
/* only used when CIFS_MOUNT_USE_PREFIX_PATH is set */
char *prepath;
- /*
- * Canonical DFS path initially provided by the mount call. We might connect to something
- * different via DFS but we want to keep it to do failover properly.
- */
- char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */
/* randomly generated 128-bit number for indexing dfs mount groups in referral cache */
uuid_t dfs_mount_id;
/*
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 353bd0dd7026..342717bf1dc2 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -84,9 +84,9 @@ struct key_type cifs_spnego_key_type = {
/* get a key struct with a SPNEGO security blob, suitable for session setup */
struct key *
-cifs_get_spnego_key(struct cifs_ses *sesInfo)
+cifs_get_spnego_key(struct cifs_ses *sesInfo,
+ struct TCP_Server_Info *server)
{
- struct TCP_Server_Info *server = cifs_ses_server(sesInfo);
struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
char *description, *dp;
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index e6a0451877d4..7f102ffeb675 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -29,7 +29,8 @@ struct cifs_spnego_msg {
#ifdef __KERNEL__
extern struct key_type cifs_spnego_key_type;
-extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo);
+extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo,
+ struct TCP_Server_Info *server);
#endif /* KERNEL */
#endif /* _CIFS_SPNEGO_H */
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index 12bde7bfda86..180c234c2f46 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -393,26 +393,14 @@ static void cifs_put_swn_reg(struct cifs_swn_reg *swnreg)
static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const char *name, int state)
{
- int i;
-
switch (state) {
case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name);
- for (i = 0; i < swnreg->tcon->ses->chan_count; i++) {
- spin_lock(&GlobalMid_Lock);
- if (swnreg->tcon->ses->chans[i].server->tcpStatus != CifsExiting)
- swnreg->tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
- }
+ cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true);
break;
case CIFS_SWN_RESOURCE_STATE_AVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name);
- for (i = 0; i < swnreg->tcon->ses->chan_count; i++) {
- spin_lock(&GlobalMid_Lock);
- if (swnreg->tcon->ses->chans[i].server->tcpStatus != CifsExiting)
- swnreg->tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
- }
+ cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true);
break;
case CIFS_SWN_RESOURCE_STATE_UNKNOWN:
cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name);
@@ -510,10 +498,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a
goto unlock;
}
- spin_lock(&GlobalMid_Lock);
- if (tcon->ses->server->tcpStatus != CifsExiting)
- tcon->ses->server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
+ cifs_signal_cifsd_for_reconnect(tcon->ses->server, false);
unlock:
mutex_unlock(&tcon->ses->server->srv_mutex);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index ee3aab3dd4ac..bf861fef2f0c 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -949,6 +949,9 @@ static void populate_new_aces(char *nacl_base,
pnntace = (struct cifs_ace *) (nacl_base + nsize);
nsize += setup_special_mode_ACE(pnntace, nmode);
num_aces++;
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += setup_authusers_ACE(pnntace);
+ num_aces++;
goto set_size;
}
@@ -1297,7 +1300,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
if (uid_valid(uid)) { /* chown */
uid_t id;
- nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ nowner_sid_ptr = kzalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!nowner_sid_ptr) {
rc = -ENOMEM;
@@ -1326,7 +1329,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
}
if (gid_valid(gid)) { /* chgrp */
gid_t id;
- ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ ngroup_sid_ptr = kzalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!ngroup_sid_ptr) {
rc = -ENOMEM;
@@ -1613,7 +1616,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
nsecdesclen = secdesclen;
if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */
if (mode_from_sid)
- nsecdesclen += sizeof(struct cifs_ace);
+ nsecdesclen += 2 * sizeof(struct cifs_ace);
else /* cifsacl */
nsecdesclen += 5 * sizeof(struct cifs_ace);
} else { /* chown */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d118282071b3..0912d8bbbac1 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -141,9 +141,13 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
if ((cifs_pdu == NULL) || (server == NULL))
return -EINVAL;
+ spin_lock(&cifs_tcp_ses_lock);
if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
- server->tcpStatus == CifsNeedNegotiate)
+ server->tcpStatus == CifsNeedNegotiate) {
+ spin_unlock(&cifs_tcp_ses_lock);
return rc;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
if (!server->session_estab) {
memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9fa930dfd78d..2b1a1c029c75 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -26,6 +26,7 @@
#include <linux/random.h>
#include <linux/uuid.h>
#include <linux/xattr.h>
+#include <uapi/linux/magic.h>
#include <net/ipv6.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -38,7 +39,6 @@
#include <linux/key-type.h>
#include "cifs_spnego.h"
#include "fscache.h"
-#include "smb2pdu.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
@@ -203,13 +203,16 @@ cifs_read_super(struct super_block *sb)
sb->s_time_max = ts.tv_sec;
}
- sb->s_magic = CIFS_MAGIC_NUMBER;
+ sb->s_magic = CIFS_SUPER_MAGIC;
sb->s_op = &cifs_super_ops;
sb->s_xattr = cifs_xattr_handlers;
rc = super_setup_bdi(sb);
if (rc)
goto out_no_root;
/* tune readahead according to rsize if readahead size not set on mount */
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ tcon->ses->server->ops->negotiate_rsize(tcon, cifs_sb->ctx);
if (cifs_sb->ctx->rasize)
sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE;
else
@@ -254,26 +257,33 @@ static void cifs_kill_sb(struct super_block *sb)
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
struct cached_fid *cfid;
+ struct rb_root *root = &cifs_sb->tlink_tree;
+ struct rb_node *node;
+ struct tcon_link *tlink;
/*
* We need to release all dentries for the cached directories
* before we kill the sb.
*/
if (cifs_sb->root) {
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+ tcon = tlink_tcon(tlink);
+ if (IS_ERR(tcon))
+ continue;
+ cfid = &tcon->crfid;
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->dentry) {
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+ }
+ mutex_unlock(&cfid->fid_mutex);
+ }
+
+ /* finally release root dentry */
dput(cifs_sb->root);
cifs_sb->root = NULL;
}
- tcon = cifs_sb_master_tcon(cifs_sb);
- if (tcon) {
- cfid = &tcon->crfid;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->dentry) {
-
- dput(cfid->dentry);
- cfid->dentry = NULL;
- }
- mutex_unlock(&cfid->fid_mutex);
- }
kill_anon_super(sb);
cifs_umount(cifs_sb);
@@ -354,7 +364,7 @@ static struct inode *
cifs_alloc_inode(struct super_block *sb)
{
struct cifsInodeInfo *cifs_inode;
- cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL);
+ cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL);
if (!cifs_inode)
return NULL;
cifs_inode->cifsAttrs = 0x20; /* default */
@@ -397,6 +407,9 @@ static void
cifs_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
+ if (inode->i_state & I_PINNING_FSCACHE_WB)
+ cifs_fscache_unuse_inode_cookie(inode, true);
+ cifs_fscache_release_inode_cookie(inode);
clear_inode(inode);
}
@@ -688,14 +701,14 @@ static void cifs_umount_begin(struct super_block *sb)
tcon = cifs_sb_master_tcon(cifs_sb);
spin_lock(&cifs_tcp_ses_lock);
- if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
+ if ((tcon->tc_count > 1) || (tcon->status == TID_EXITING)) {
/* we have other mounts to same share or we have
already tried to force umount this and woken up
all waiting network requests, nothing to do */
spin_unlock(&cifs_tcp_ses_lock);
return;
} else if (tcon->tc_count == 1)
- tcon->tidStatus = CifsExiting;
+ tcon->status = TID_EXITING;
spin_unlock(&cifs_tcp_ses_lock);
/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
@@ -721,6 +734,12 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
}
#endif
+static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ fscache_unpin_writeback(wbc, cifs_inode_cookie(inode));
+ return 0;
+}
+
static int cifs_drop_inode(struct inode *inode)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -733,6 +752,7 @@ static int cifs_drop_inode(struct inode *inode)
static const struct super_operations cifs_super_ops = {
.statfs = cifs_statfs,
.alloc_inode = cifs_alloc_inode,
+ .write_inode = cifs_write_inode,
.free_inode = cifs_free_inode,
.drop_inode = cifs_drop_inode,
.evict_inode = cifs_evict_inode,
@@ -774,7 +794,7 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb)
sep = CIFS_DIR_SEP(cifs_sb);
dentry = dget(sb->s_root);
- p = s = full_path;
+ s = full_path;
do {
struct inode *dir = d_inode(dentry);
@@ -909,6 +929,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
out_super:
deactivate_locked_super(sb);
+ return root;
out:
if (cifs_sb) {
kfree(cifs_sb->prepath);
@@ -925,7 +946,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t rc;
struct inode *inode = file_inode(iocb->ki_filp);
- if (iocb->ki_filp->f_flags & O_DIRECT)
+ if (iocb->ki_flags & IOCB_DIRECT)
return cifs_user_readv(iocb, iter);
rc = cifs_revalidate_mapping(inode);
@@ -1625,13 +1646,9 @@ init_cifs(void)
goto out_destroy_cifsoplockd_wq;
}
- rc = cifs_fscache_register();
- if (rc)
- goto out_destroy_deferredclose_wq;
-
rc = cifs_init_inodecache();
if (rc)
- goto out_unreg_fscache;
+ goto out_destroy_deferredclose_wq;
rc = cifs_init_mids();
if (rc)
@@ -1693,8 +1710,6 @@ out_destroy_mids:
cifs_destroy_mids();
out_destroy_inodecache:
cifs_destroy_inodecache();
-out_unreg_fscache:
- cifs_fscache_unregister();
out_destroy_deferredclose_wq:
destroy_workqueue(deferredclose_wq);
out_destroy_cifsoplockd_wq:
@@ -1730,7 +1745,6 @@ exit_cifs(void)
cifs_destroy_request_bufs();
cifs_destroy_mids();
cifs_destroy_inodecache();
- cifs_fscache_unregister();
destroy_workqueue(deferredclose_wq);
destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(decrypt_wq);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b50da1901ebd..c0542bdcd06b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -152,5 +152,6 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.33"
+#define SMB3_PRODUCT_BUILD 35
+#define CIFS_VERSION "2.36"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e916470468ea..8de977c359b1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -15,15 +15,16 @@
#include <linux/slab.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
+#include <linux/utsname.h>
+#include <linux/netfs.h>
#include "cifs_fs_sb.h"
#include "cifsacl.h"
#include <crypto/internal/hash.h>
#include <linux/scatterlist.h>
#include <uapi/linux/cifs/cifs_mount.h>
+#include "../smbfs_common/smb2pdu.h"
#include "smb2pdu.h"
-#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
-
#define SMB_PATH_MAX 260
#define CIFS_PORT 445
#define RFC1001_PORT 139
@@ -74,7 +75,8 @@
#define SMB_ECHO_INTERVAL_MAX 600
#define SMB_ECHO_INTERVAL_DEFAULT 60
-/* dns resolution interval in seconds */
+/* dns resolution intervals in seconds */
+#define SMB_DNS_RESOLVE_INTERVAL_MIN 120
#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600
/* maximum number of PDUs in one compound */
@@ -98,6 +100,8 @@
#define XATTR_DOS_ATTRIB "user.DOSATTRIB"
#endif
+#define CIFS_MAX_WORKSTATION_LEN (__NEW_UTS_LEN + 1) /* reasonable max for client */
+
/*
* CIFS vfs client Status information (based on what we know.)
*/
@@ -108,7 +112,22 @@ enum statusEnum {
CifsGood,
CifsExiting,
CifsNeedReconnect,
- CifsNeedNegotiate
+ CifsNeedNegotiate,
+ CifsInNegotiate,
+ CifsNeedSessSetup,
+ CifsInSessSetup,
+};
+
+/* associated with each tree connection to the server */
+enum tid_status_enum {
+ TID_NEW = 0,
+ TID_GOOD,
+ TID_EXITING,
+ TID_NEED_RECON,
+ TID_NEED_TCON,
+ TID_IN_TCON,
+ TID_NEED_FILES_INVALIDATE, /* currently unused */
+ TID_IN_FILES_INVALIDATE
};
enum securityEnum {
@@ -258,13 +277,16 @@ struct smb_version_operations {
/* check if we need to negotiate */
bool (*need_neg)(struct TCP_Server_Info *);
/* negotiate to the server */
- int (*negotiate)(const unsigned int, struct cifs_ses *);
+ int (*negotiate)(const unsigned int xid,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
/* set negotiated write size */
unsigned int (*negotiate_wsize)(struct cifs_tcon *tcon, struct smb3_fs_context *ctx);
/* set negotiated read size */
unsigned int (*negotiate_rsize)(struct cifs_tcon *tcon, struct smb3_fs_context *ctx);
/* setup smb session */
int (*sess_setup)(const unsigned int, struct cifs_ses *,
+ struct TCP_Server_Info *server,
const struct nls_table *);
/* close smb session */
int (*logoff)(const unsigned int, struct cifs_ses *);
@@ -409,7 +431,8 @@ struct smb_version_operations {
void (*set_lease_key)(struct inode *, struct cifs_fid *);
/* generate new lease key */
void (*new_lease_key)(struct cifs_fid *);
- int (*generate_signingkey)(struct cifs_ses *);
+ int (*generate_signingkey)(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *,
bool allocate_crypto);
int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
@@ -577,7 +600,7 @@ struct TCP_Server_Info {
char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
struct smb_version_operations *ops;
struct smb_version_values *vals;
- /* updates to tcpStatus protected by GlobalMid_Lock */
+ /* updates to tcpStatus protected by cifs_tcp_ses_lock */
enum statusEnum tcpStatus; /* what we think the status is */
char *hostname; /* hostname portion of UNC string */
struct socket *ssocket;
@@ -591,6 +614,7 @@ struct TCP_Server_Info {
struct list_head pending_mid_q;
bool noblocksnd; /* use blocking sendmsg */
bool noautotune; /* do not autotune send buf sizes */
+ bool nosharesock;
bool tcp_nodelay;
unsigned int credits; /* send no more requests at once */
unsigned int max_credits; /* can override large 32000 default at mnt */
@@ -653,9 +677,6 @@ struct TCP_Server_Info {
unsigned int total_read; /* total amount of data read in this pass */
atomic_t in_send; /* requests trying to send */
atomic_t num_waiters; /* blocked waiting to get in sendrecv */
-#ifdef CONFIG_CIFS_FSCACHE
- struct fscache_cookie *fscache; /* client index cache cookie */
-#endif
#ifdef CONFIG_CIFS_STATS2
atomic_t num_cmds[NUMBER_OF_SMB2_COMMANDS]; /* total requests by cmd */
atomic_t smb2slowcmd[NUMBER_OF_SMB2_COMMANDS]; /* count resps > 1 sec */
@@ -684,13 +705,34 @@ struct TCP_Server_Info {
*/
int nr_targets;
bool noblockcnt; /* use non-blocking connect() */
- bool is_channel; /* if a session channel */
+
+ /*
+ * If this is a session channel,
+ * primary_server holds the ref-counted
+ * pointer to primary channel connection for the session.
+ */
+#define CIFS_SERVER_IS_CHAN(server) (!!(server)->primary_server)
+ struct TCP_Server_Info *primary_server;
+
#ifdef CONFIG_CIFS_SWN_UPCALL
bool use_swn_dstaddr;
struct sockaddr_storage swn_dstaddr;
#endif
#ifdef CONFIG_CIFS_DFS_UPCALL
bool is_dfs_conn; /* if a dfs connection */
+ struct mutex refpath_lock; /* protects leaf_fullpath */
+ /*
+ * Canonical DFS full paths that were used to chase referrals in mount and reconnect.
+ *
+ * origin_fullpath: first or original referral path
+ * leaf_fullpath: last referral path (might be changed due to nested links in reconnect)
+ *
+ * current_fullpath: pointer to either origin_fullpath or leaf_fullpath
+ * NOTE: cannot be accessed outside cifs_reconnect() and smb2_reconnect()
+ *
+ * format: \\HOST\SHARE\[OPTIONAL PATH]
+ */
+ char *origin_fullpath, *leaf_fullpath, *current_fullpath;
#endif
};
@@ -776,7 +818,7 @@ revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
static inline void
revert_current_mid_from_hdr(struct TCP_Server_Info *server,
- const struct smb2_sync_hdr *shdr)
+ const struct smb2_hdr *shdr)
{
unsigned int num = le16_to_cpu(shdr->CreditCharge);
@@ -819,13 +861,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
-/*
- * The default wsize is 1M. find_get_pages seems to return a maximum of 256
- * pages in a single call. With PAGE_SIZE == 4k, this means we can fill
- * a single wsize request with a single call.
- */
#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
-#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
/*
* Windows only supports a max of 60kb reads and 65535 byte writes. Default to
@@ -888,12 +924,13 @@ struct cifs_chan {
*/
struct cifs_ses {
struct list_head smb_ses_list;
+ struct list_head rlist; /* reconnect list */
struct list_head tcon_list;
struct cifs_tcon *tcon_ipc;
struct mutex session_mutex;
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
- enum statusEnum status; /* updates protected by GlobalMid_Lock */
+ enum statusEnum status; /* updates protected by cifs_tcp_ses_lock */
unsigned overrideSecFlg; /* if non-zero override global sec flags */
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
@@ -907,21 +944,18 @@ struct cifs_ses {
and after mount option parsing we fill it */
char *domainName;
char *password;
+ char *workstation_name;
struct session_key auth_key;
struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
enum securityEnum sectype; /* what security flavor was specified? */
bool sign; /* is signing required? */
- bool need_reconnect:1; /* connection reset, uid now invalid */
bool domainAuto:1;
- bool binding:1; /* are we binding the session? */
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
__u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE];
__u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
- __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
-
/*
* Network interfaces available on the server this session is
* connected to.
@@ -932,48 +966,42 @@ struct cifs_ses {
* iface_lock should be taken when accessing any of these fields
*/
spinlock_t iface_lock;
+ /* ========= begin: protected by iface_lock ======== */
struct cifs_server_iface *iface_list;
size_t iface_count;
unsigned long iface_last_update; /* jiffies */
+ /* ========= end: protected by iface_lock ======== */
+ spinlock_t chan_lock;
+ /* ========= begin: protected by chan_lock ======== */
#define CIFS_MAX_CHANNELS 16
+#define CIFS_ALL_CHANNELS_SET(ses) \
+ ((1UL << (ses)->chan_count) - 1)
+#define CIFS_ALL_CHANS_NEED_RECONNECT(ses) \
+ ((ses)->chans_need_reconnect == CIFS_ALL_CHANNELS_SET(ses))
+#define CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses) \
+ ((ses)->chans_need_reconnect = CIFS_ALL_CHANNELS_SET(ses))
+#define CIFS_CHAN_NEEDS_RECONNECT(ses, index) \
+ test_bit((index), &(ses)->chans_need_reconnect)
+
struct cifs_chan chans[CIFS_MAX_CHANNELS];
- struct cifs_chan *binding_chan;
size_t chan_count;
size_t chan_max;
atomic_t chan_seq; /* round robin state */
-};
-/*
- * When binding a new channel, we need to access the channel which isn't fully
- * established yet.
- */
-
-static inline
-struct cifs_chan *cifs_ses_binding_channel(struct cifs_ses *ses)
-{
- if (ses->binding)
- return ses->binding_chan;
- else
- return NULL;
-}
-
-/*
- * Returns the server pointer of the session. When binding a new
- * channel this returns the last channel which isn't fully established
- * yet.
- *
- * This function should be use for negprot/sess.setup codepaths. For
- * the other requests see cifs_pick_channel().
- */
-static inline
-struct TCP_Server_Info *cifs_ses_server(struct cifs_ses *ses)
-{
- if (ses->binding)
- return ses->binding_chan->server;
- else
- return ses->server;
-}
+ /*
+ * chans_need_reconnect is a bitmap indicating which of the channels
+ * under this smb session need to be reconnected.
+ * If not multichannel session, only one bit will be used.
+ *
+ * We will ask for sess and tcon reconnection only if all the
+ * channels are marked for needing reconnection. This will
+ * enable the sessions on top to continue to live as long as
+ * any of the channels below are active.
+ */
+ unsigned long chans_need_reconnect;
+ /* ========= end: protected by chan_lock ======== */
+};
static inline bool
cap_unix(struct cifs_ses *ses)
@@ -1013,7 +1041,7 @@ struct cifs_tcon {
char *password; /* for share-level security */
__u32 tid; /* The 4 byte tree id */
__u16 Flags; /* optional support bits */
- enum statusEnum tidStatus;
+ enum tid_status_enum status;
atomic_t num_smbs_sent;
union {
struct {
@@ -1084,13 +1112,12 @@ struct cifs_tcon {
__u32 max_bytes_copy;
#ifdef CONFIG_CIFS_FSCACHE
u64 resource_id; /* server resource id */
- struct fscache_cookie *fscache; /* cookie for share */
+ struct fscache_volume *fscache; /* cookie for share */
#endif
struct list_head pending_opens; /* list of incomplete opens */
struct cached_fid crfid; /* Cached root fid */
/* BB add field for back pointer to sb struct(s)? */
#ifdef CONFIG_CIFS_DFS_UPCALL
- char *dfs_path; /* canonical DFS path */
struct list_head ulist; /* cache update list */
#endif
};
@@ -1378,6 +1405,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
*/
struct cifsInodeInfo {
+ struct {
+ /* These must be contiguous */
+ struct inode vfs_inode; /* the VFS's inode record */
+ struct netfs_i_context netfs_ctx; /* Netfslib context */
+ };
bool can_cache_brlcks;
struct list_head llist; /* locks held by this inode */
/*
@@ -1408,10 +1440,6 @@ struct cifsInodeInfo {
u64 uniqueid; /* server inode number */
u64 createtime; /* creation time on server */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */
-#ifdef CONFIG_CIFS_FSCACHE
- struct fscache_cookie *fscache;
-#endif
- struct inode vfs_inode;
struct list_head deferred_closes; /* list of deferred closes */
spinlock_t deferred_lock; /* protection on deferred list */
bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */
@@ -1941,4 +1969,14 @@ static inline bool is_tcon_dfs(struct cifs_tcon *tcon)
tcon->share_flags & (SHI1005_FLAGS_DFS | SHI1005_FLAGS_DFS_ROOT);
}
+static inline bool cifs_is_referral_server(struct cifs_tcon *tcon,
+ const struct dfs_info3_param *ref)
+{
+ /*
+ * Check if all targets are capable of handling DFS referrals as per
+ * MS-DFSC 2.2.4 RESP_GET_DFS_REFERRAL.
+ */
+ return is_tcon_dfs(tcon) || (ref && (ref->flags & DFSREF_REFERRAL_SERVER));
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index d2ff438fd31f..aeba371c4c70 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -123,18 +123,6 @@
*/
#define CIFS_SESS_KEY_SIZE (16)
-/*
- * Size of the smb3 signing key
- */
-#define SMB3_SIGN_KEY_SIZE (16)
-
-/*
- * Size of the smb3 encryption/decryption key storage.
- * This size is big enough to store any cipher key types.
- */
-#define SMB3_ENC_DEC_KEY_SIZE (32)
-
-#define CIFS_CLIENT_CHALLENGE_SIZE (8)
#define CIFS_SERVER_CHALLENGE_SIZE (8)
#define CIFS_HMAC_MD5_HASH_SIZE (16)
#define CIFS_CPHTXT_SIZE (16)
@@ -1658,7 +1646,7 @@ struct smb_t2_rsp {
#define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105
#define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106
#define SMB_FIND_FILE_UNIX 0x202
-#define SMB_FIND_FILE_POSIX_INFO 0x064
+/* #define SMB_FIND_FILE_POSIX_INFO 0x064 */
typedef struct smb_com_transaction2_qpi_req {
struct smb_hdr hdr; /* wct = 14+ */
@@ -2560,7 +2548,7 @@ typedef struct {
__le32 EaSize; /* length of the xattrs */
__u8 ShortNameLength;
__u8 Reserved;
- __u8 ShortName[12];
+ __u8 ShortName[24];
char FileName[1];
} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index d0f85b666662..0df3b24a0bf4 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -131,7 +131,14 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *in_buf ,
struct smb_hdr *out_buf,
int *bytes_returned);
-extern int cifs_reconnect(struct TCP_Server_Info *server);
+void
+cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
+ bool all_channels);
+void
+cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
+ bool mark_smb_session);
+extern int cifs_reconnect(struct TCP_Server_Info *server,
+ bool mark_smb_session);
extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr);
extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
extern bool backup_cred(struct cifs_sb_info *);
@@ -164,6 +171,7 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
enum securityEnum requested);
extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
const struct nls_table *nls_cp);
extern struct timespec64 cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
extern u64 cifs_UnixTimeToNT(struct timespec64);
@@ -269,8 +277,9 @@ extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
const char *path);
-
-extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb3_fs_context *ctx);
+extern struct TCP_Server_Info *
+cifs_get_tcp_session(struct smb3_fs_context *ctx,
+ struct TCP_Server_Info *primary_server);
extern void cifs_put_tcp_session(struct TCP_Server_Info *server,
int from_reconnect);
extern void cifs_put_tcon(struct cifs_tcon *tcon);
@@ -292,11 +301,15 @@ extern int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon,
const struct nls_table *nlsc);
extern int cifs_negotiate_protocol(const unsigned int xid,
- struct cifs_ses *ses);
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
struct nls_table *nls_info);
extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required);
-extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses);
+extern int CIFSSMBNegotiate(const unsigned int xid,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
const char *tree, struct cifs_tcon *tcon,
@@ -503,8 +516,10 @@ extern int cifs_verify_signature(struct smb_rqst *rqst,
extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
extern void cifs_crypto_secmech_release(struct TCP_Server_Info *server);
extern int calc_seckey(struct cifs_ses *);
-extern int generate_smb30signingkey(struct cifs_ses *);
-extern int generate_smb311signingkey(struct cifs_ses *);
+extern int generate_smb30signingkey(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+extern int generate_smb311signingkey(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
extern int CIFSSMBCopy(unsigned int xid,
struct cifs_tcon *source_tcon,
@@ -598,6 +613,20 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
bool is_server_using_iface(struct TCP_Server_Info *server,
struct cifs_server_iface *iface);
bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface);
+void cifs_ses_mark_for_reconnect(struct cifs_ses *ses);
+
+unsigned int
+cifs_ses_get_chan_index(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+void
+cifs_chan_set_need_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+void
+cifs_chan_clear_need_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+bool
+cifs_chan_needs_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
@@ -607,7 +636,7 @@ int smb2_parse_query_directory(struct cifs_tcon *tcon, struct kvec *rsp_iov,
struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server);
void cifs_put_tcp_super(struct super_block *sb);
-int update_super_prepath(struct cifs_tcon *tcon, char *prefix);
+int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix);
char *extract_hostname(const char *unc);
char *extract_sharename(const char *unc);
@@ -624,6 +653,11 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
int match_target_ip(struct TCP_Server_Info *server,
const char *share, size_t share_len,
bool *result);
+
+int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *dfs_link_path);
#endif
static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options)
@@ -634,4 +668,7 @@ static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options)
return options;
}
+struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon);
+void cifs_put_tcon_super(struct super_block *sb);
+
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 243d17696f06..47e927c4ff8d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -73,6 +73,15 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
struct list_head *tmp;
struct list_head *tmp1;
+ /* only send once per connect */
+ spin_lock(&cifs_tcp_ses_lock);
+ if ((tcon->ses->status != CifsGood) || (tcon->status != TID_NEED_RECON)) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return;
+ }
+ tcon->status = TID_IN_FILES_INVALIDATE;
+ spin_unlock(&cifs_tcp_ses_lock);
+
/* list all files open on tree connection and mark them invalid */
spin_lock(&tcon->open_file_lock);
list_for_each_safe(tmp, tmp1, &tcon->openFileList) {
@@ -89,6 +98,11 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid));
mutex_unlock(&tcon->crfid.fid_mutex);
+ spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->status == TID_IN_FILES_INVALIDATE)
+ tcon->status = TID_NEED_TCON;
+ spin_unlock(&cifs_tcp_ses_lock);
+
/*
* BB Add call to invalidate_inodes(sb) for all superblocks mounted
* to this tcon.
@@ -120,15 +134,18 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
* only tree disconnect, open, and write, (and ulogoff which does not
* have tcon) are allowed as we start force umount
*/
- if (tcon->tidStatus == CifsExiting) {
+ spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->status == TID_EXITING) {
if (smb_command != SMB_COM_WRITE_ANDX &&
smb_command != SMB_COM_OPEN_ANDX &&
smb_command != SMB_COM_TREE_DISCONNECT) {
+ spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "can not send cmd %d while umounting\n",
smb_command);
return -ENODEV;
}
}
+ spin_unlock(&cifs_tcp_ses_lock);
retries = server->nr_targets;
@@ -148,8 +165,12 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
}
/* are we still trying to reconnect? */
- if (server->tcpStatus != CifsNeedReconnect)
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus != CifsNeedReconnect) {
+ spin_unlock(&cifs_tcp_ses_lock);
break;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
if (retries && --retries)
continue;
@@ -166,31 +187,49 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
retries = server->nr_targets;
}
- if (!ses->need_reconnect && !tcon->need_reconnect)
+ spin_lock(&ses->chan_lock);
+ if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) {
+ spin_unlock(&ses->chan_lock);
return 0;
+ }
+ spin_unlock(&ses->chan_lock);
nls_codepage = load_nls_default();
/*
- * need to prevent multiple threads trying to simultaneously
- * reconnect the same SMB session
- */
- mutex_lock(&ses->session_mutex);
-
- /*
* Recheck after acquire mutex. If another thread is negotiating
* and the server never sends an answer the socket will be closed
* and tcpStatus set to reconnect.
*/
+ spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus == CifsNeedReconnect) {
+ spin_unlock(&cifs_tcp_ses_lock);
rc = -EHOSTDOWN;
- mutex_unlock(&ses->session_mutex);
goto out;
}
+ spin_unlock(&cifs_tcp_ses_lock);
- rc = cifs_negotiate_protocol(0, ses);
- if (rc == 0 && ses->need_reconnect)
- rc = cifs_setup_session(0, ses, nls_codepage);
+ /*
+ * need to prevent multiple threads trying to simultaneously
+ * reconnect the same SMB session
+ */
+ spin_lock(&ses->chan_lock);
+ if (!cifs_chan_needs_reconnect(ses, server)) {
+ spin_unlock(&ses->chan_lock);
+
+ /* this means that we only need to tree connect */
+ if (tcon->need_reconnect)
+ goto skip_sess_setup;
+
+ rc = -EHOSTDOWN;
+ goto out;
+ }
+ spin_unlock(&ses->chan_lock);
+
+ mutex_lock(&ses->session_mutex);
+ rc = cifs_negotiate_protocol(0, ses, server);
+ if (!rc)
+ rc = cifs_setup_session(0, ses, server, nls_codepage);
/* do we need to reconnect tcon? */
if (rc || !tcon->need_reconnect) {
@@ -198,6 +237,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
goto out;
}
+skip_sess_setup:
cifs_mark_open_files_invalid(tcon);
rc = cifs_tree_connect(0, tcon, nls_codepage);
mutex_unlock(&ses->session_mutex);
@@ -337,8 +377,13 @@ static int
smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon,
void **request_buf, void **response_buf)
{
- if (tcon->ses->need_reconnect || tcon->need_reconnect)
+ spin_lock(&tcon->ses->chan_lock);
+ if (cifs_chan_needs_reconnect(tcon->ses, tcon->ses->server) ||
+ tcon->need_reconnect) {
+ spin_unlock(&tcon->ses->chan_lock);
return -EHOSTDOWN;
+ }
+ spin_unlock(&tcon->ses->chan_lock);
return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
}
@@ -476,14 +521,15 @@ should_set_ext_sec_flag(enum securityEnum sectype)
}
int
-CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
+CIFSSMBNegotiate(const unsigned int xid,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
{
NEGOTIATE_REQ *pSMB;
NEGOTIATE_RSP *pSMBr;
int rc = 0;
int bytes_returned;
int i;
- struct TCP_Server_Info *server = ses->server;
u16 count;
if (!server) {
@@ -550,7 +596,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
set_credits(server, server->maxReq);
/* probably no need to store and check maxvcs */
server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
- /* set up max_read for readpages check */
+ /* set up max_read for readahead check */
server->max_read = server->maxBuf;
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
@@ -600,8 +646,12 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon)
* the tcon is no longer on the list, so no need to take lock before
* checking this.
*/
- if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
- return 0;
+ spin_lock(&tcon->ses->chan_lock);
+ if ((tcon->need_reconnect) || CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses)) {
+ spin_unlock(&tcon->ses->chan_lock);
+ return -EIO;
+ }
+ spin_unlock(&tcon->ses->chan_lock);
rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon,
(void **)&smb_buffer);
@@ -696,9 +746,14 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
return -EIO;
mutex_lock(&ses->session_mutex);
- if (ses->need_reconnect)
+ spin_lock(&ses->chan_lock);
+ if (CIFS_ALL_CHANS_NEED_RECONNECT(ses)) {
+ spin_unlock(&ses->chan_lock);
goto session_already_dead; /* no need to send SMBlogoff if uid
already closed due to reconnect */
+ }
+ spin_unlock(&ses->chan_lock);
+
rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
if (rc) {
mutex_unlock(&ses->session_mutex);
@@ -1401,7 +1456,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
- cifs_reconnect(server);
+ cifs_reconnect(server, true);
return -1;
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c3b94c1e4591..42e14f408856 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -61,6 +61,20 @@ extern bool disable_legacy_dialects;
/* Drop the connection to not overload the server */
#define NUM_STATUS_IO_TIMEOUT 5
+struct mount_ctx {
+ struct cifs_sb_info *cifs_sb;
+ struct smb3_fs_context *fs_ctx;
+ unsigned int xid;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ struct cifs_ses *root_ses;
+ uuid_t mount_id;
+ char *origin_fullpath, *leaf_fullpath;
+#endif
+};
+
static int ip_connect(struct TCP_Server_Info *server);
static int generic_ip_connect(struct TCP_Server_Info *server);
static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
@@ -115,7 +129,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
* To make sure we don't use the cached entry, retry 1s
* after expiry.
*/
- ttl = (expiry - now + 1);
+ ttl = max_t(unsigned long, expiry - now, SMB_DNS_RESOLVE_INTERVAL_MIN) + 1;
}
rc = !rc ? -1 : 0;
@@ -148,153 +162,118 @@ static void cifs_resolve_server(struct work_struct *work)
mutex_unlock(&server->srv_mutex);
}
-#ifdef CONFIG_CIFS_DFS_UPCALL
-/* These functions must be called with server->srv_mutex held */
-static void reconn_set_next_dfs_target(struct TCP_Server_Info *server,
- struct cifs_sb_info *cifs_sb,
- struct dfs_cache_tgt_list *tgt_list,
- struct dfs_cache_tgt_iterator **tgt_it)
+/*
+ * Update the tcpStatus for the server.
+ * This is used to signal the cifsd thread to call cifs_reconnect.
+ * ONLY the cifsd thread should call cifs_reconnect. For any other
+ * thread, use this function.
+ *
+ * @server: the tcp ses for which reconnect is needed
+ * @all_channels: if this needs to be done for all channels
+ */
+void
+cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
+ bool all_channels)
{
- const char *name;
- int rc;
-
- if (!cifs_sb || !cifs_sb->origin_fullpath)
- return;
-
- if (!*tgt_it) {
- *tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
- } else {
- *tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it);
- if (!*tgt_it)
- *tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
- }
-
- cifs_dbg(FYI, "%s: UNC: %s\n", __func__, cifs_sb->origin_fullpath);
-
- name = dfs_cache_get_tgt_name(*tgt_it);
+ struct TCP_Server_Info *pserver;
+ struct cifs_ses *ses;
+ int i;
- kfree(server->hostname);
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
- server->hostname = extract_hostname(name);
- if (IS_ERR(server->hostname)) {
- cifs_dbg(FYI,
- "%s: failed to extract hostname from target: %ld\n",
- __func__, PTR_ERR(server->hostname));
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!all_channels) {
+ pserver->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&cifs_tcp_ses_lock);
return;
}
- rc = reconn_set_ipaddr_from_hostname(server);
- if (rc) {
- cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
- __func__, rc);
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->chan_lock);
+ for (i = 0; i < ses->chan_count; i++)
+ ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&ses->chan_lock);
}
+ spin_unlock(&cifs_tcp_ses_lock);
}
-static inline int reconn_setup_dfs_targets(struct cifs_sb_info *cifs_sb,
- struct dfs_cache_tgt_list *tl)
-{
- if (!cifs_sb->origin_fullpath)
- return -EOPNOTSUPP;
- return dfs_cache_noreq_find(cifs_sb->origin_fullpath + 1, NULL, tl);
-}
-#endif
-
/*
- * cifs tcp session reconnection
+ * Mark all sessions and tcons for reconnect.
+ * IMPORTANT: make sure that this gets called only from
+ * cifsd thread. For any other thread, use
+ * cifs_signal_cifsd_for_reconnect
*
- * mark tcp session as reconnecting so temporarily locked
- * mark all smb sessions as reconnecting for tcp session
- * reconnect tcp session
- * wake up waiters on reconnection? - (not needed currently)
+ * @server: the tcp ses for which reconnect is needed
+ * @server needs to be previously set to CifsNeedReconnect.
+ * @mark_smb_session: whether even sessions need to be marked
*/
-int
-cifs_reconnect(struct TCP_Server_Info *server)
+void
+cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
+ bool mark_smb_session)
{
- int rc = 0;
- struct list_head *tmp, *tmp2;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
- struct mid_q_entry *mid_entry;
- struct list_head retry_list;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- struct super_block *sb = NULL;
- struct cifs_sb_info *cifs_sb = NULL;
- struct dfs_cache_tgt_list tgt_list = DFS_CACHE_TGT_LIST_INIT(tgt_list);
- struct dfs_cache_tgt_iterator *tgt_it = NULL;
-#endif
- spin_lock(&GlobalMid_Lock);
- server->nr_targets = 1;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- spin_unlock(&GlobalMid_Lock);
- sb = cifs_get_tcp_super(server);
- if (IS_ERR(sb)) {
- rc = PTR_ERR(sb);
- cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n",
- __func__, rc);
- sb = NULL;
- } else {
- cifs_sb = CIFS_SB(sb);
- rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list);
- if (rc) {
- cifs_sb = NULL;
- if (rc != -EOPNOTSUPP) {
- cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n",
- __func__);
- }
- } else {
- server->nr_targets = dfs_cache_get_nr_tgts(&tgt_list);
- }
- }
- cifs_dbg(FYI, "%s: will retry %d target(s)\n", __func__,
- server->nr_targets);
- spin_lock(&GlobalMid_Lock);
-#endif
- if (server->tcpStatus == CifsExiting) {
- /* the demux thread will exit normally
- next time through the loop */
- spin_unlock(&GlobalMid_Lock);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- dfs_cache_free_tgts(&tgt_list);
- cifs_put_tcp_super(sb);
-#endif
- wake_up(&server->response_q);
- return rc;
- } else
- server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
- server->maxBuf = 0;
- server->max_read = 0;
+ /*
+ * before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they
+ * are not used until reconnected.
+ */
+ cifs_dbg(FYI, "%s: marking necessary sessions and tcons for reconnect\n", __func__);
+
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
- cifs_dbg(FYI, "Mark tcp session as need reconnect\n");
- trace_smb3_reconnect(server->CurrentMid, server->conn_id, server->hostname);
- /* before reconnecting the tcp session, mark the smb session (uid)
- and the tid bad so they are not used until reconnected */
- cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n",
- __func__);
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
- ses->need_reconnect = true;
- list_for_each(tmp2, &ses->tcon_list) {
- tcon = list_entry(tmp2, struct cifs_tcon, tcon_list);
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->chan_lock);
+ if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server))
+ goto next_session;
+
+ if (mark_smb_session)
+ CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses);
+ else
+ cifs_chan_set_need_reconnect(ses, server);
+
+ /* If all channels need reconnect, then tcon needs reconnect */
+ if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses))
+ goto next_session;
+
+ ses->status = CifsNeedReconnect;
+
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
tcon->need_reconnect = true;
+ tcon->status = TID_NEED_RECON;
}
if (ses->tcon_ipc)
ses->tcon_ipc->need_reconnect = true;
+
+next_session:
+ spin_unlock(&ses->chan_lock);
}
spin_unlock(&cifs_tcp_ses_lock);
+}
+
+static void
+cifs_abort_connection(struct TCP_Server_Info *server)
+{
+ struct mid_q_entry *mid, *nmid;
+ struct list_head retry_list;
+
+ server->maxBuf = 0;
+ server->max_read = 0;
/* do not want to be sending data on a socket we are freeing */
cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
mutex_lock(&server->srv_mutex);
if (server->ssocket) {
- cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n",
- server->ssocket->state, server->ssocket->flags);
+ cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", server->ssocket->state,
+ server->ssocket->flags);
kernel_sock_shutdown(server->ssocket, SHUT_WR);
- cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n",
- server->ssocket->state, server->ssocket->flags);
+ cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n", server->ssocket->state,
+ server->ssocket->flags);
sock_release(server->ssocket);
server->ssocket = NULL;
}
@@ -309,23 +288,21 @@ cifs_reconnect(struct TCP_Server_Info *server)
INIT_LIST_HEAD(&retry_list);
cifs_dbg(FYI, "%s: moving mids to private list\n", __func__);
spin_lock(&GlobalMid_Lock);
- list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
- mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- kref_get(&mid_entry->refcount);
- if (mid_entry->mid_state == MID_REQUEST_SUBMITTED)
- mid_entry->mid_state = MID_RETRY_NEEDED;
- list_move(&mid_entry->qhead, &retry_list);
- mid_entry->mid_flags |= MID_DELETED;
+ list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) {
+ kref_get(&mid->refcount);
+ if (mid->mid_state == MID_REQUEST_SUBMITTED)
+ mid->mid_state = MID_RETRY_NEEDED;
+ list_move(&mid->qhead, &retry_list);
+ mid->mid_flags |= MID_DELETED;
}
spin_unlock(&GlobalMid_Lock);
mutex_unlock(&server->srv_mutex);
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
- list_for_each_safe(tmp, tmp2, &retry_list) {
- mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- list_del_init(&mid_entry->qhead);
- mid_entry->callback(mid_entry);
- cifs_mid_q_entry_release(mid_entry);
+ list_for_each_entry_safe(mid, nmid, &retry_list, qhead) {
+ list_del_init(&mid->qhead);
+ mid->callback(mid);
+ cifs_mid_q_entry_release(mid);
}
if (cifs_rdma_enabled(server)) {
@@ -333,38 +310,61 @@ cifs_reconnect(struct TCP_Server_Info *server)
smbd_destroy(server);
mutex_unlock(&server->srv_mutex);
}
+}
- do {
- try_to_freeze();
+static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets)
+{
+ spin_lock(&cifs_tcp_ses_lock);
+ server->nr_targets = num_targets;
+ if (server->tcpStatus == CifsExiting) {
+ /* the demux thread will exit normally next time through the loop */
+ spin_unlock(&cifs_tcp_ses_lock);
+ wake_up(&server->response_q);
+ return false;
+ }
- mutex_lock(&server->srv_mutex);
+ cifs_dbg(FYI, "Mark tcp session as need reconnect\n");
+ trace_smb3_reconnect(server->CurrentMid, server->conn_id,
+ server->hostname);
+ server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&cifs_tcp_ses_lock);
+ return true;
+}
- if (!cifs_swn_set_server_dstaddr(server)) {
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (cifs_sb && cifs_sb->origin_fullpath)
- /*
- * Set up next DFS target server (if any) for reconnect. If DFS
- * feature is disabled, then we will retry last server we
- * connected to before.
- */
- reconn_set_next_dfs_target(server, cifs_sb, &tgt_list, &tgt_it);
- else {
-#endif
- /*
- * Resolve the hostname again to make sure that IP address is up-to-date.
- */
- rc = reconn_set_ipaddr_from_hostname(server);
- if (rc) {
- cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
- __func__, rc);
- }
+/*
+ * cifs tcp session reconnection
+ *
+ * mark tcp session as reconnecting so temporarily locked
+ * mark all smb sessions as reconnecting for tcp session
+ * reconnect tcp session
+ * wake up waiters on reconnection? - (not needed currently)
+ *
+ * if mark_smb_session is passed as true, unconditionally mark
+ * the smb session (and tcon) for reconnect as well. This value
+ * doesn't really matter for non-multichannel scenario.
+ *
+ */
+static int __cifs_reconnect(struct TCP_Server_Info *server,
+ bool mark_smb_session)
+{
+ int rc = 0;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- }
-#endif
+ if (!cifs_tcp_ses_needs_reconnect(server, 1))
+ return 0;
+ cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
+ cifs_abort_connection(server);
+
+ do {
+ try_to_freeze();
+ mutex_lock(&server->srv_mutex);
+
+ if (!cifs_swn_set_server_dstaddr(server)) {
+ /* resolve the hostname again to make sure that IP address is up-to-date */
+ rc = reconn_set_ipaddr_from_hostname(server);
+ cifs_dbg(FYI, "%s: reconn_set_ipaddr_from_hostname: rc=%d\n", __func__, rc);
}
if (cifs_rdma_enabled(server))
@@ -372,41 +372,190 @@ cifs_reconnect(struct TCP_Server_Info *server)
else
rc = generic_ip_connect(server);
if (rc) {
- cifs_dbg(FYI, "reconnect error %d\n", rc);
mutex_unlock(&server->srv_mutex);
+ cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
set_credits(server, 1);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_tcp_ses_lock);
cifs_swn_reset_server_dstaddr(server);
mutex_unlock(&server->srv_mutex);
+ mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
}
} while (server->tcpStatus == CifsNeedReconnect);
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsNeedNegotiate)
+ mod_delayed_work(cifsiod_wq, &server->echo, 0);
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ wake_up(&server->response_q);
+ return rc;
+}
+
#ifdef CONFIG_CIFS_DFS_UPCALL
- if (tgt_it) {
- rc = dfs_cache_noreq_update_tgthint(cifs_sb->origin_fullpath + 1,
- tgt_it);
- if (rc) {
- cifs_server_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n",
- __func__, rc);
+static int __reconnect_target_unlocked(struct TCP_Server_Info *server, const char *target)
+{
+ int rc;
+ char *hostname;
+ /*
+ * Reconnect the socket of @server to @target; returns the rc of the
+ * connect call at the bottom (smbd_reconnect() or generic_ip_connect()).
+ *
+ * When cifs_swn_set_server_dstaddr() returns zero, server->hostname is
+ * switched to the host extracted from @target and its address is resolved
+ * again before connecting. The @target check is a pointer comparison:
+ * reconnect_target_unlocked() passes server->hostname itself when its
+ * target list is empty. If no host can be extracted, the previous
+ * server->hostname is kept.
+ */
+ if (!cifs_swn_set_server_dstaddr(server)) {
+ if (server->hostname != target) {
+ hostname = extract_hostname(target);
+ if (!IS_ERR(hostname)) {
+ kfree(server->hostname);
+ server->hostname = hostname;
+ } else {
+ cifs_dbg(FYI, "%s: couldn't extract hostname or address from dfs target: %ld\n",
+ __func__, PTR_ERR(hostname));
+ cifs_dbg(FYI, "%s: default to last target server: %s\n", __func__,
+ server->hostname);
+ }
}
- dfs_cache_free_tgts(&tgt_list);
+ /*
+ * Resolve the hostname again to make sure that the IP address is
+ * up-to-date. The result is only logged here: rc is overwritten by the
+ * connect attempt below.
+ */
+ rc = reconn_set_ipaddr_from_hostname(server);
+ cifs_dbg(FYI, "%s: reconn_set_ipaddr_from_hostname: rc=%d\n", __func__, rc);
}
+ /* Reconnect the socket: smbd_reconnect() when RDMA is enabled, generic_ip_connect() otherwise */
+ if (cifs_rdma_enabled(server))
+ rc = smbd_reconnect(server);
+ else
+ rc = generic_ip_connect(server);
- cifs_put_tcp_super(sb);
-#endif
+ return rc;
+}
+/*
+ * Reconnect the socket of @server using the dfs targets in @tl. Takes no
+ * lock itself; reconnect_dfs_server() calls it with server->srv_mutex held.
+ *
+ * On return *@target_hint is the iterator of the target that connected, or
+ * NULL when @tl was empty or no target could be reached. Returns the rc of
+ * the last __reconnect_target_unlocked() attempt, which is 0 on success.
+ */
+static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_cache_tgt_list *tl,
+ struct dfs_cache_tgt_iterator **target_hint)
+{
+ int rc;
+ struct dfs_cache_tgt_iterator *tit;
+
+ *target_hint = NULL;
+
+ /* If the dfs target list is empty, reconnect to the last server (server->hostname) */
+ tit = dfs_cache_get_tgt_iterator(tl);
+ if (!tit)
+ return __reconnect_target_unlocked(server, server->hostname);
+
+ /* Otherwise, try the dfs targets in @tl in order and stop at the first one that connects */
+ for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) {
+ rc = __reconnect_target_unlocked(server, dfs_cache_get_tgt_name(tit));
+ if (!rc) {
+ *target_hint = tit;
+ break;
+ }
+ }
+ return rc;
+}
+/*
+ * Reconnect a dfs tcp session, trying the targets cached for
+ * server->current_fullpath.
+ *
+ * Returns 0 without doing anything when cifs_tcp_ses_needs_reconnect() says
+ * no reconnect is needed. Otherwise it marks all sessions and tcons for
+ * reconnect, aborts the current connection and retries every 3 seconds for
+ * as long as tcpStatus stays CifsNeedReconnect. After a successful connect
+ * the reconnect work is kicked, the connected target is passed to
+ * dfs_cache_noreq_update_tgthint() and the echo work is rescheduled.
+ */
+static int reconnect_dfs_server(struct TCP_Server_Info *server)
+{
+ int rc = 0;
+ const char *refpath = server->current_fullpath + 1;
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ struct dfs_cache_tgt_iterator *target_hint = NULL;
+ int num_targets = 0;
+
+ /*
+ * Determine the number of dfs targets the referral path (@refpath) resolves to.
+ *
+ * smb2_reconnect() needs to know how long it should wait based upon the number of dfs
+ * targets (server->nr_targets). It's also possible that the cached referral was cleared
+ * through /proc/fs/cifs/dfscache or the target list is empty due to server settings after
+ * refreshing the referral, so, in this case, default it to 1.
+ */
+ if (!dfs_cache_noreq_find(refpath, NULL, &tl))
+ num_targets = dfs_cache_get_nr_tgts(&tl);
+ if (!num_targets)
+ num_targets = 1;
+ /*
+ * NOTE(review): the early return below skips dfs_cache_free_tgts(&tl); if
+ * dfs_cache_noreq_find() filled @tl above, that list looks leaked on this
+ * path - confirm against dfs_cache.c.
+ */
+ if (!cifs_tcp_ses_needs_reconnect(server, num_targets))
+ return 0;
+
+ /*
+ * Unconditionally mark all sessions & tcons for reconnect as we might be connecting to a
+ * different server or share during failover. It could be improved by adding some logic to
+ * only do that in case it connects to a different server or share, though.
+ */
+ cifs_mark_tcp_ses_conns_for_reconnect(server, true);
+
+ cifs_abort_connection(server);
+
+ do {
+ try_to_freeze();
+ mutex_lock(&server->srv_mutex);
+
+ rc = reconnect_target_unlocked(server, &tl, &target_hint);
+ if (rc) {
+ /* Failed to reconnect socket: drop srv_mutex, wait 3 seconds, then re-check the loop condition */
+ mutex_unlock(&server->srv_mutex);
+ cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
+ msleep(3000);
+ continue;
+ }
+ /*
+ * Socket was created. Update tcp session status to CifsNeedNegotiate so that a
+ * process waiting for reconnect will know it needs to re-establish session and tcon
+ * through the reconnected target server.
+ */
+ atomic_inc(&tcpSesReconnectCount);
+ set_credits(server, 1);
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus != CifsExiting)
+ server->tcpStatus = CifsNeedNegotiate;
+ spin_unlock(&cifs_tcp_ses_lock);
+ cifs_swn_reset_server_dstaddr(server);
+ mutex_unlock(&server->srv_mutex);
+ mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
+ } while (server->tcpStatus == CifsNeedReconnect);
+
+ if (target_hint)
+ dfs_cache_noreq_update_tgthint(refpath, target_hint);
+
+ dfs_cache_free_tgts(&tl);
+
+ /* Need to set up echo worker again once connection has been established */
+ spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
+ spin_unlock(&cifs_tcp_ses_lock);
+
wake_up(&server->response_q);
return rc;
}
+int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+{
+ /*
+ * Reconnect entry point when CONFIG_CIFS_DFS_UPCALL is set.
+ *
+ * If the tcp session is not a dfs connection, then reconnect to the last
+ * target server. @mark_smb_session is only forwarded to __cifs_reconnect();
+ * reconnect_dfs_server() does not take it.
+ */
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!server->is_dfs_conn) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return __cifs_reconnect(server, mark_smb_session);
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+ /* Also take the non-dfs path when either origin_fullpath or leaf_fullpath is unset */
+ mutex_lock(&server->refpath_lock);
+ if (!server->origin_fullpath || !server->leaf_fullpath) {
+ mutex_unlock(&server->refpath_lock);
+ return __cifs_reconnect(server, mark_smb_session);
+ }
+ mutex_unlock(&server->refpath_lock);
+
+ return reconnect_dfs_server(server);
+}
+#else /* !CONFIG_CIFS_DFS_UPCALL */
+int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+{
+ return __cifs_reconnect(server, mark_smb_session);
+}
+#endif
+
static void
cifs_echo_request(struct work_struct *work)
{
@@ -485,15 +634,18 @@ server_unresponsive(struct TCP_Server_Info *server)
* 65s kernel_recvmsg times out, and we see that we haven't gotten
* a response in >60s.
*/
+ spin_lock(&cifs_tcp_ses_lock);
if ((server->tcpStatus == CifsGood ||
server->tcpStatus == CifsNeedNegotiate) &&
(!server->ops->can_echo || server->ops->can_echo(server)) &&
time_after(jiffies, server->lstrp + 3 * server->echo_interval)) {
+ spin_unlock(&cifs_tcp_ses_lock);
cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n",
(3 * server->echo_interval) / HZ);
- cifs_reconnect(server);
+ cifs_reconnect(server, false);
return true;
}
+ spin_unlock(&cifs_tcp_ses_lock);
return false;
}
@@ -527,7 +679,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
/* reconnect if no credits and no requests in flight */
if (zero_credits(server)) {
- cifs_reconnect(server);
+ cifs_reconnect(server, false);
return -ECONNABORTED;
}
@@ -538,13 +690,18 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
else
length = sock_recvmsg(server->ssocket, smb_msg, 0);
- if (server->tcpStatus == CifsExiting)
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -ESHUTDOWN;
+ }
if (server->tcpStatus == CifsNeedReconnect) {
- cifs_reconnect(server);
+ spin_unlock(&cifs_tcp_ses_lock);
+ cifs_reconnect(server, false);
return -ECONNABORTED;
}
+ spin_unlock(&cifs_tcp_ses_lock);
if (length == -ERESTARTSYS ||
length == -EAGAIN ||
@@ -561,7 +718,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
if (length <= 0) {
cifs_dbg(FYI, "Received no data or error: %d\n", length);
- cifs_reconnect(server);
+ cifs_reconnect(server, false);
return -ECONNABORTED;
}
}
@@ -640,11 +797,11 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
* initialize frame).
*/
cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
- cifs_reconnect(server);
+ cifs_reconnect(server, true);
break;
default:
cifs_server_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type);
- cifs_reconnect(server);
+ cifs_reconnect(server, true);
}
return false;
@@ -665,19 +822,20 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
* Trying to handle/dequeue a mid after the send_recv()
* function has finished processing it is a bug.
*/
- if (mid->mid_flags & MID_DELETED)
+ if (mid->mid_flags & MID_DELETED) {
+ spin_unlock(&GlobalMid_Lock);
pr_warn_once("trying to dequeue a deleted mid\n");
- else {
+ } else {
list_del_init(&mid->qhead);
mid->mid_flags |= MID_DELETED;
+ spin_unlock(&GlobalMid_Lock);
}
- spin_unlock(&GlobalMid_Lock);
}
static unsigned int
smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buffer;
/*
* SMB1 does not use credits.
@@ -721,9 +879,9 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
cancel_delayed_work_sync(&server->echo);
cancel_delayed_work_sync(&server->resolve);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&cifs_tcp_ses_lock);
server->tcpStatus = CifsExiting;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_tcp_ses_lock);
wake_up_all(&server->response_q);
/* check if we have blocked requests that need to free */
@@ -794,7 +952,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
*/
}
- kfree(server->hostname);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ kfree(server->origin_fullpath);
+ kfree(server->leaf_fullpath);
+#endif
kfree(server);
length = atomic_dec_return(&tcpSesAllocCount);
@@ -813,7 +974,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) -
server->vals->header_preamble_size) {
cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
- cifs_reconnect(server);
+ cifs_reconnect(server, true);
return -ECONNABORTED;
}
@@ -860,7 +1021,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
- cifs_reconnect(server);
+ cifs_reconnect(server, true);
return -1;
}
@@ -878,7 +1039,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
static void
smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buffer;
int scredits, in_flight;
/*
@@ -895,7 +1056,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_hdr_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_server_dbg(FYI, "%s: added %u credits total=%d\n",
@@ -964,7 +1125,7 @@ next_pdu:
server->vals->header_preamble_size) {
cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n",
server->pdu_size);
- cifs_reconnect(server);
+ cifs_reconnect(server, true);
continue;
}
@@ -1016,7 +1177,7 @@ next_pdu:
server->ops->is_status_io_timeout(buf)) {
num_io_timeout++;
if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) {
- cifs_reconnect(server);
+ cifs_reconnect(server, false);
num_io_timeout = 0;
continue;
}
@@ -1086,7 +1247,7 @@ next_pdu:
}
memalloc_noreclaim_restore(noreclaim_flag);
- module_put_and_exit(0);
+ module_put_and_kthread_exit(0);
}
/*
@@ -1221,6 +1382,10 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
if (ctx->nosharesock)
return 0;
+ /* this server does not share socket */
+ if (server->nosharesock)
+ return 0;
+
/* If multidialect negotiation see if existing sessions match one */
if (strcmp(ctx->vals->version_string, SMB3ANY_VERSION_STRING) == 0) {
if (server->vals->protocol_id < SMB30_PROT_ID)
@@ -1235,6 +1400,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
return 0;
+ if (strcasecmp(server->hostname, ctx->server_hostname))
+ return 0;
+
if (!match_address(server, addr,
(struct sockaddr *)&ctx->srcaddr))
return 0;
@@ -1281,7 +1449,7 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx)
* Skip ses channels since they're only handled in lower layers
* (e.g. cifs_send_recv).
*/
- if (server->is_channel || !match_server(server, ctx))
+ if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx))
continue;
++server->srv_count;
@@ -1312,6 +1480,10 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
list_del_init(&server->tcp_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
+ /* For secondary channels, we pick up ref-count on the primary server */
+ if (CIFS_SERVER_IS_CHAN(server))
+ cifs_put_tcp_session(server->primary_server, from_reconnect);
+
cancel_delayed_work_sync(&server->echo);
cancel_delayed_work_sync(&server->resolve);
@@ -1326,16 +1498,16 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
else
cancel_delayed_work_sync(&server->reconnect);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&cifs_tcp_ses_lock);
server->tcpStatus = CifsExiting;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_tcp_ses_lock);
cifs_crypto_secmech_release(server);
- cifs_fscache_release_client_cookie(server);
kfree(server->session_key.response);
server->session_key.response = NULL;
server->session_key.len = 0;
+ kfree(server->hostname);
task = xchg(&server->tsk, NULL);
if (task)
@@ -1343,7 +1515,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
}
struct TCP_Server_Info *
-cifs_get_tcp_session(struct smb3_fs_context *ctx)
+cifs_get_tcp_session(struct smb3_fs_context *ctx,
+ struct TCP_Server_Info *primary_server)
{
struct TCP_Server_Info *tcp_ses = NULL;
int rc;
@@ -1361,14 +1534,18 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx)
goto out_err;
}
+ tcp_ses->hostname = kstrdup(ctx->server_hostname, GFP_KERNEL);
+ if (!tcp_ses->hostname) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+
+ if (ctx->nosharesock)
+ tcp_ses->nosharesock = true;
+
tcp_ses->ops = ctx->ops;
tcp_ses->vals = ctx->vals;
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
- tcp_ses->hostname = extract_hostname(ctx->UNC);
- if (IS_ERR(tcp_ses->hostname)) {
- rc = PTR_ERR(tcp_ses->hostname);
- goto out_err_crypto_release;
- }
tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId);
tcp_ses->noblockcnt = ctx->rootfs;
@@ -1379,6 +1556,12 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx)
tcp_ses->in_flight = 0;
tcp_ses->max_in_flight = 0;
tcp_ses->credits = 1;
+ if (primary_server) {
+ spin_lock(&cifs_tcp_ses_lock);
+ ++primary_server->srv_count;
+ tcp_ses->primary_server = primary_server;
+ spin_unlock(&cifs_tcp_ses_lock);
+ }
init_waitqueue_head(&tcp_ses->response_q);
init_waitqueue_head(&tcp_ses->request_q);
INIT_LIST_HEAD(&tcp_ses->pending_mid_q);
@@ -1399,6 +1582,9 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx)
INIT_DELAYED_WORK(&tcp_ses->resolve, cifs_resolve_server);
INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server);
mutex_init(&tcp_ses->reconnect_mutex);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ mutex_init(&tcp_ses->refpath_lock);
+#endif
memcpy(&tcp_ses->srcaddr, &ctx->srcaddr,
sizeof(tcp_ses->srcaddr));
memcpy(&tcp_ses->dstaddr, &ctx->dstaddr,
@@ -1463,7 +1649,9 @@ smbd_connected:
* to the struct since the kernel thread not created yet
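* so this update of tcpStatus would not strictly need a spinlock; it is still done under cifs_tcp_ses_lock below, like the other tcpStatus updates in this change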
*/
+ spin_lock(&cifs_tcp_ses_lock);
tcp_ses->tcpStatus = CifsNeedNegotiate;
+ spin_unlock(&cifs_tcp_ses_lock);
if ((ctx->max_credits < 20) || (ctx->max_credits > 60000))
tcp_ses->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
@@ -1477,8 +1665,6 @@ smbd_connected:
list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- cifs_fscache_get_client_cookie(tcp_ses);
-
/* queue echo request delayed work */
queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval);
@@ -1497,8 +1683,9 @@ out_err_crypto_release:
out_err:
if (tcp_ses) {
- if (!IS_ERR(tcp_ses->hostname))
- kfree(tcp_ses->hostname);
+ if (CIFS_SERVER_IS_CHAN(tcp_ses))
+ cifs_put_tcp_session(tcp_ses->primary_server, false);
+ kfree(tcp_ses->hostname);
if (tcp_ses->ssocket)
sock_release(tcp_ses->ssocket);
kfree(tcp_ses);
@@ -1516,8 +1703,12 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
* If an existing session is limited to less channels than
* requested, it should not be reused
*/
- if (ses->chan_max < ctx->max_channels)
+ spin_lock(&ses->chan_lock);
+ if (ses->chan_max < ctx->max_channels) {
+ spin_unlock(&ses->chan_lock);
return 0;
+ }
+ spin_unlock(&ses->chan_lock);
switch (ses->sectype) {
case Kerberos:
@@ -1652,6 +1843,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
void cifs_put_smb_ses(struct cifs_ses *ses)
{
unsigned int rc, xid;
+ unsigned int chan_count;
struct TCP_Server_Info *server = ses->server;
cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
@@ -1668,15 +1860,13 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
spin_unlock(&cifs_tcp_ses_lock);
return;
}
- spin_unlock(&cifs_tcp_ses_lock);
/* ses_count can never go negative */
WARN_ON(ses->ses_count < 0);
- spin_lock(&GlobalMid_Lock);
if (ses->status == CifsGood)
ses->status = CifsExiting;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_tcp_ses_lock);
cifs_free_ipc(ses);
@@ -1693,13 +1883,21 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->chan_lock);
+ chan_count = ses->chan_count;
+
/* close any extra channels */
- if (ses->chan_count > 1) {
+ if (chan_count > 1) {
int i;
- for (i = 1; i < ses->chan_count; i++)
+ for (i = 1; i < chan_count; i++) {
+ spin_unlock(&ses->chan_lock);
cifs_put_tcp_session(ses->chans[i].server, 0);
+ spin_lock(&ses->chan_lock);
+ ses->chans[i].server = NULL;
+ }
}
+ spin_unlock(&ses->chan_lock);
sesInfoFree(ses);
cifs_put_tcp_session(server, 0);
@@ -1839,6 +2037,19 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
}
}
+ ctx->workstation_name = kstrdup(ses->workstation_name, GFP_KERNEL);
+ if (!ctx->workstation_name) {
+ cifs_dbg(FYI, "Unable to allocate memory for workstation_name\n");
+ rc = -ENOMEM;
+ kfree(ctx->username);
+ ctx->username = NULL;
+ kfree_sensitive(ctx->password);
+ ctx->password = NULL;
+ kfree(ctx->domainname);
+ ctx->domainname = NULL;
+ goto out_key_put;
+ }
+
out_key_put:
up_read(&key->sem);
key_put(key);
@@ -1881,18 +2092,22 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
cifs_dbg(FYI, "Existing smb sess found (status=%d)\n",
ses->status);
- mutex_lock(&ses->session_mutex);
- rc = cifs_negotiate_protocol(xid, ses);
- if (rc) {
- mutex_unlock(&ses->session_mutex);
- /* problem -- put our ses reference */
- cifs_put_smb_ses(ses);
- free_xid(xid);
- return ERR_PTR(rc);
- }
- if (ses->need_reconnect) {
+ spin_lock(&ses->chan_lock);
+ if (cifs_chan_needs_reconnect(ses, server)) {
+ spin_unlock(&ses->chan_lock);
cifs_dbg(FYI, "Session needs reconnect\n");
- rc = cifs_setup_session(xid, ses,
+
+ mutex_lock(&ses->session_mutex);
+ rc = cifs_negotiate_protocol(xid, ses, server);
+ if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ /* problem -- put our ses reference */
+ cifs_put_smb_ses(ses);
+ free_xid(xid);
+ return ERR_PTR(rc);
+ }
+
+ rc = cifs_setup_session(xid, ses, server,
ctx->local_nls);
if (rc) {
mutex_unlock(&ses->session_mutex);
@@ -1901,8 +2116,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
free_xid(xid);
return ERR_PTR(rc);
}
+ mutex_unlock(&ses->session_mutex);
+
+ spin_lock(&ses->chan_lock);
}
- mutex_unlock(&ses->session_mutex);
+ spin_unlock(&ses->chan_lock);
/* existing SMB ses has a server reference already */
cifs_put_tcp_session(server, 0);
@@ -1939,6 +2157,12 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
if (!ses->domainName)
goto get_ses_fail;
}
+ if (ctx->workstation_name) {
+ ses->workstation_name = kstrdup(ctx->workstation_name,
+ GFP_KERNEL);
+ if (!ses->workstation_name)
+ goto get_ses_fail;
+ }
if (ctx->domainauto)
ses->domainAuto = ctx->domainauto;
ses->cred_uid = ctx->cred_uid;
@@ -1946,26 +2170,35 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
ses->sectype = ctx->sectype;
ses->sign = ctx->sign;
- mutex_lock(&ses->session_mutex);
/* add server as first channel */
+ spin_lock(&ses->chan_lock);
ses->chans[0].server = server;
ses->chan_count = 1;
ses->chan_max = ctx->multichannel ? ctx->max_channels:1;
+ ses->chans_need_reconnect = 1;
+ spin_unlock(&ses->chan_lock);
- rc = cifs_negotiate_protocol(xid, ses);
+ mutex_lock(&ses->session_mutex);
+ rc = cifs_negotiate_protocol(xid, ses, server);
if (!rc)
- rc = cifs_setup_session(xid, ses, ctx->local_nls);
+ rc = cifs_setup_session(xid, ses, server, ctx->local_nls);
+ mutex_unlock(&ses->session_mutex);
/* each channel uses a different signing key */
+ spin_lock(&ses->chan_lock);
memcpy(ses->chans[0].signkey, ses->smb3signingkey,
sizeof(ses->smb3signingkey));
+ spin_unlock(&ses->chan_lock);
- mutex_unlock(&ses->session_mutex);
if (rc)
goto get_ses_fail;
- /* success, put it on the list and add it as first channel */
+ /*
+ * success, put it on the list and add it as first channel
+ * note: the session becomes active soon after this. So you'll
+ * need to lock before changing something in the session.
+ */
spin_lock(&cifs_tcp_ses_lock);
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -1984,7 +2217,7 @@ get_ses_fail:
static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
- if (tcon->tidStatus == CifsExiting)
+ if (tcon->status == TID_EXITING)
return 0;
if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE))
return 0;
@@ -2045,6 +2278,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)
/* tc_count can never go negative */
WARN_ON(tcon->tc_count < 0);
+ list_del_init(&tcon->tcon_list);
+ spin_unlock(&cifs_tcp_ses_lock);
+
if (tcon->use_witness) {
int rc;
@@ -2055,9 +2291,6 @@ cifs_put_tcon(struct cifs_tcon *tcon)
}
}
- list_del_init(&tcon->tcon_list);
- spin_unlock(&cifs_tcp_ses_lock);
-
xid = get_xid();
if (ses->server->ops->tree_disconnect)
ses->server->ops->tree_disconnect(xid, tcon);
@@ -2167,17 +2400,22 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
if (ses->server->posix_ext_supported) {
tcon->posix_extensions = true;
pr_warn_once("SMB3.11 POSIX Extensions are experimental\n");
- } else {
+ } else if ((ses->server->vals->protocol_id == SMB311_PROT_ID) ||
+ (strcmp(ses->server->vals->version_string,
+ SMB3ANY_VERSION_STRING) == 0) ||
+ (strcmp(ses->server->vals->version_string,
+ SMBDEFAULT_VERSION_STRING) == 0)) {
cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions\n");
rc = -EOPNOTSUPP;
goto out_fail;
+ } else {
+ cifs_dbg(VFS, "Check vers= mount option. SMB3.11 "
+ "disabled but required for POSIX extensions\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
}
}
- /*
- * BB Do we need to wrap session_mutex around this TCon call and Unix
- * SetFS as we do on SessSetup and reconnect?
- */
xid = get_xid();
rc = ses->server->ops->tree_connect(xid, ses, ctx->UNC, tcon,
ctx->local_nls);
@@ -2283,8 +2521,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
list_add(&tcon->tcon_list, &ses->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
- cifs_fscache_get_super_cookie(tcon);
-
return tcon;
out_fail:
@@ -2646,11 +2882,12 @@ generic_ip_connect(struct TCP_Server_Info *server)
rc = 0;
if (rc < 0) {
cifs_dbg(FYI, "Error %d connecting to server\n", rc);
+ trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc);
sock_release(socket);
server->ssocket = NULL;
return rc;
}
-
+ trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr);
if (sport == htons(RFC1001_PORT))
rc = ip_rfc1001_connect(server);
@@ -2845,73 +3082,64 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
}
/* Release the established connection: put the tcon if set, else the ses, else the tcp session; then clear CIFS_MOUNT_POSIX_PATHS and free the xid */
-static inline void mount_put_conns(struct cifs_sb_info *cifs_sb,
- unsigned int xid,
- struct TCP_Server_Info *server,
- struct cifs_ses *ses, struct cifs_tcon *tcon)
+static inline void mount_put_conns(struct mount_ctx *mnt_ctx)
{
int rc = 0; /* NOTE(review): not referenced in the lines shown for this function */
- if (tcon)
- cifs_put_tcon(tcon);
- else if (ses)
- cifs_put_smb_ses(ses);
- else if (server)
- cifs_put_tcp_session(server, 0);
- cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
- free_xid(xid);
+ if (mnt_ctx->tcon)
+ cifs_put_tcon(mnt_ctx->tcon);
+ else if (mnt_ctx->ses)
+ cifs_put_smb_ses(mnt_ctx->ses);
+ else if (mnt_ctx->server)
+ cifs_put_tcp_session(mnt_ctx->server, 0);
+ mnt_ctx->cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
+ free_xid(mnt_ctx->xid);
}
/* Get connections for tcp, ses and tcon */
-static int mount_get_conns(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_sb,
- unsigned int *xid,
- struct TCP_Server_Info **nserver,
- struct cifs_ses **nses, struct cifs_tcon **ntcon)
+static int mount_get_conns(struct mount_ctx *mnt_ctx)
{
int rc = 0;
- struct TCP_Server_Info *server;
- struct cifs_ses *ses;
- struct cifs_tcon *tcon;
-
- *nserver = NULL;
- *nses = NULL;
- *ntcon = NULL;
+ struct TCP_Server_Info *server = NULL;
+ struct cifs_ses *ses = NULL;
+ struct cifs_tcon *tcon = NULL;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ unsigned int xid;
- *xid = get_xid();
+ xid = get_xid();
/* get a reference to a tcp session */
- server = cifs_get_tcp_session(ctx);
+ server = cifs_get_tcp_session(ctx, NULL);
if (IS_ERR(server)) {
rc = PTR_ERR(server);
- return rc;
+ server = NULL;
+ goto out;
}
- *nserver = server;
-
/* get a reference to a SMB session */
ses = cifs_get_smb_ses(server, ctx);
if (IS_ERR(ses)) {
rc = PTR_ERR(ses);
- return rc;
+ ses = NULL;
+ goto out;
}
- *nses = ses;
-
if ((ctx->persistent == true) && (!(ses->server->capabilities &
SMB2_GLOBAL_CAP_PERSISTENT_HANDLES))) {
cifs_server_dbg(VFS, "persistent handles not supported by server\n");
- return -EOPNOTSUPP;
+ rc = -EOPNOTSUPP;
+ goto out;
}
/* search for existing tcon to this server share */
tcon = cifs_get_tcon(ses, ctx);
if (IS_ERR(tcon)) {
rc = PTR_ERR(tcon);
- return rc;
+ tcon = NULL;
+ goto out;
}
- *ntcon = tcon;
-
/* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
if (tcon->posix_extensions)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
@@ -2922,17 +3150,22 @@ static int mount_get_conns(struct smb3_fs_context *ctx, struct cifs_sb_info *cif
* reset of caps checks mount to see if unix extensions disabled
* for just this mount.
*/
- reset_cifs_unix_caps(*xid, tcon, cifs_sb, ctx);
+ reset_cifs_unix_caps(xid, tcon, cifs_sb, ctx);
+ spin_lock(&cifs_tcp_ses_lock);
if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
(le64_to_cpu(tcon->fsUnixInfo.Capability) &
- CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP))
- return -EACCES;
+ CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ rc = -EACCES;
+ goto out;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
} else
tcon->unix_ext = 0; /* server does not support them */
/* do not care if a following call succeed - informational */
if (!tcon->pipe && server->ops->qfs_tcon) {
- server->ops->qfs_tcon(*xid, tcon, cifs_sb);
+ server->ops->qfs_tcon(xid, tcon, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) {
if (tcon->fsDevInfo.DeviceCharacteristics &
cpu_to_le32(FILE_READ_ONLY_DEVICE))
@@ -2956,7 +3189,21 @@ static int mount_get_conns(struct smb3_fs_context *ctx, struct cifs_sb_info *cif
(cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx)))
cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx);
- return 0;
+ /*
+ * The cookie is initialized from volume info returned above.
+ * Inside cifs_fscache_get_super_cookie it checks
+ * that we do not get super cookie twice.
+ */
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
+ cifs_fscache_get_super_cookie(tcon);
+
+out:
+ mnt_ctx->server = server;
+ mnt_ctx->ses = ses;
+ mnt_ctx->tcon = tcon;
+ mnt_ctx->xid = xid;
+
+ return rc;
}
static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
@@ -2986,18 +3233,17 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
}
#ifdef CONFIG_CIFS_DFS_UPCALL
-static int mount_get_dfs_conns(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_sb,
- unsigned int *xid, struct TCP_Server_Info **nserver,
- struct cifs_ses **nses, struct cifs_tcon **ntcon)
+/* Get unique dfs connections */
+static int mount_get_dfs_conns(struct mount_ctx *mnt_ctx)
{
int rc;
- ctx->nosharesock = true;
- rc = mount_get_conns(ctx, cifs_sb, xid, nserver, nses, ntcon);
- if (*nserver) {
+ mnt_ctx->fs_ctx->nosharesock = true;
+ rc = mount_get_conns(mnt_ctx);
+ if (mnt_ctx->server) {
cifs_dbg(FYI, "%s: marking tcp session as a dfs connection\n", __func__);
spin_lock(&cifs_tcp_ses_lock);
- (*nserver)->is_dfs_conn = true;
+ mnt_ctx->server->is_dfs_conn = true;
spin_unlock(&cifs_tcp_ses_lock);
}
return rc;
@@ -3039,190 +3285,38 @@ build_unc_path_to_root(const struct smb3_fs_context *ctx,
}
/*
- * expand_dfs_referral - Perform a dfs referral query and update the cifs_sb
- *
- * If a referral is found, cifs_sb->ctx->mount_options will be (re-)allocated
- * to a string containing updated options for the submount. Otherwise it
- * will be left untouched.
+ * expand_dfs_referral - Update cifs_sb from dfs referral path
*
- * Returns the rc from get_dfs_path to the caller, which can be used to
- * determine whether there were referrals.
+ * cifs_sb->ctx->mount_options will be (re-)allocated to a string containing updated options for the
+ * submount. Otherwise it will be left untouched.
*/
-static int
-expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
- struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_sb,
- char *ref_path)
+static int expand_dfs_referral(struct mount_ctx *mnt_ctx, const char *full_path,
+ struct dfs_info3_param *referral)
{
int rc;
- struct dfs_info3_param referral = {0};
- char *full_path = NULL, *mdata = NULL;
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
- return -EREMOTE;
-
- full_path = build_unc_path_to_root(ctx, cifs_sb, true);
- if (IS_ERR(full_path))
- return PTR_ERR(full_path);
-
- rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
- ref_path, &referral, NULL);
- if (!rc) {
- char *fake_devname = NULL;
-
- mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options,
- full_path + 1, &referral,
- &fake_devname);
- free_dfs_info_param(&referral);
-
- if (IS_ERR(mdata)) {
- rc = PTR_ERR(mdata);
- mdata = NULL;
- } else {
- /*
- * We can not clear out the whole structure since we
- * no longer have an explicit function to parse
- * a mount-string. Instead we need to clear out the
- * individual fields that are no longer valid.
- */
- kfree(ctx->prepath);
- ctx->prepath = NULL;
- rc = cifs_setup_volume_info(ctx, mdata, fake_devname);
- }
- kfree(fake_devname);
- kfree(cifs_sb->ctx->mount_options);
- cifs_sb->ctx->mount_options = mdata;
- }
- kfree(full_path);
- return rc;
-}
-
-static int get_next_dfs_tgt(struct dfs_cache_tgt_list *tgt_list,
- struct dfs_cache_tgt_iterator **tgt_it)
-{
- if (!*tgt_it)
- *tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
- else
- *tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it);
- return !*tgt_it ? -EHOSTDOWN : 0;
-}
-
-static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it,
- struct smb3_fs_context *fake_ctx, struct smb3_fs_context *ctx)
-{
- const char *tgt = dfs_cache_get_tgt_name(tgt_it);
- int len = strlen(tgt) + 2;
- char *new_unc;
-
- new_unc = kmalloc(len, GFP_KERNEL);
- if (!new_unc)
- return -ENOMEM;
- scnprintf(new_unc, len, "\\%s", tgt);
-
- kfree(ctx->UNC);
- ctx->UNC = new_unc;
-
- if (fake_ctx->prepath) {
- kfree(ctx->prepath);
- ctx->prepath = fake_ctx->prepath;
- fake_ctx->prepath = NULL;
- }
- memcpy(&ctx->dstaddr, &fake_ctx->dstaddr, sizeof(ctx->dstaddr));
-
- return 0;
-}
-
-static int do_dfs_failover(const char *path, const char *full_path, struct cifs_sb_info *cifs_sb,
- struct smb3_fs_context *ctx, struct cifs_ses *root_ses,
- unsigned int *xid, struct TCP_Server_Info **server,
- struct cifs_ses **ses, struct cifs_tcon **tcon)
-{
- int rc;
- char *npath = NULL;
- struct dfs_cache_tgt_list tgt_list = DFS_CACHE_TGT_LIST_INIT(tgt_list);
- struct dfs_cache_tgt_iterator *tgt_it = NULL;
- struct smb3_fs_context tmp_ctx = {NULL};
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
- return -EOPNOTSUPP;
-
- npath = dfs_cache_canonical_path(path, cifs_sb->local_nls, cifs_remap(cifs_sb));
- if (IS_ERR(npath))
- return PTR_ERR(npath);
-
- cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, npath, full_path);
-
- rc = dfs_cache_noreq_find(npath, NULL, &tgt_list);
- if (rc)
- goto out;
- /*
- * We use a 'tmp_ctx' here because we need pass it down to the mount_{get,put} functions to
- * test connection against new DFS targets.
- */
- rc = smb3_fs_context_dup(&tmp_ctx, ctx);
- if (rc)
- goto out;
-
- for (;;) {
- struct dfs_info3_param ref = {0};
- char *fake_devname = NULL, *mdata = NULL;
-
- /* Get next DFS target server - if any */
- rc = get_next_dfs_tgt(&tgt_list, &tgt_it);
- if (rc)
- break;
-
- rc = dfs_cache_get_tgt_referral(npath, tgt_it, &ref);
- if (rc)
- break;
-
- cifs_dbg(FYI, "%s: old ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC,
- tmp_ctx.prepath);
-
- mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, full_path + 1, &ref,
- &fake_devname);
- free_dfs_info_param(&ref);
-
- if (IS_ERR(mdata)) {
- rc = PTR_ERR(mdata);
- mdata = NULL;
- } else
- rc = cifs_setup_volume_info(&tmp_ctx, mdata, fake_devname);
-
- kfree(mdata);
- kfree(fake_devname);
-
- if (rc)
- break;
-
- cifs_dbg(FYI, "%s: new ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC,
- tmp_ctx.prepath);
-
- mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon);
- rc = mount_get_dfs_conns(&tmp_ctx, cifs_sb, xid, server, ses, tcon);
- if (!rc || (*server && *ses)) {
- /*
- * We were able to connect to new target server. Update current context with
- * new target server.
- */
- rc = update_vol_info(tgt_it, &tmp_ctx, ctx);
- break;
- }
- }
- if (!rc) {
- cifs_dbg(FYI, "%s: final ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC,
- tmp_ctx.prepath);
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ char *fake_devname = NULL, *mdata = NULL;
+
+ mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, full_path + 1, referral,
+ &fake_devname);
+ if (IS_ERR(mdata)) {
+ rc = PTR_ERR(mdata);
+ mdata = NULL;
+ } else {
/*
- * Update DFS target hint in DFS referral cache with the target server we
- * successfully reconnected to.
+ * We can not clear out the whole structure since we no longer have an explicit
+ * function to parse a mount-string. Instead we need to clear out the individual
+ * fields that are no longer valid.
*/
- rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses, cifs_sb->local_nls,
- cifs_remap(cifs_sb), path, tgt_it);
+ kfree(ctx->prepath);
+ ctx->prepath = NULL;
+ rc = cifs_setup_volume_info(ctx, mdata, fake_devname);
}
+ kfree(fake_devname);
+ kfree(cifs_sb->ctx->mount_options);
+ cifs_sb->ctx->mount_options = mdata;
-out:
- kfree(npath);
- smb3_cleanup_fs_context_contents(&tmp_ctx);
- dfs_cache_free_tgts(&tgt_list);
return rc;
}
#endif
@@ -3329,12 +3423,14 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
* Check if path is remote (e.g. a DFS share). Return -EREMOTE if it is,
* otherwise 0.
*/
-static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx,
- const unsigned int xid,
- struct TCP_Server_Info *server,
- struct cifs_tcon *tcon)
+static int is_path_remote(struct mount_ctx *mnt_ctx)
{
int rc;
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct TCP_Server_Info *server = mnt_ctx->server;
+ unsigned int xid = mnt_ctx->xid;
+ struct cifs_tcon *tcon = mnt_ctx->tcon;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
char *full_path;
if (!server->ops->is_path_accessible)
@@ -3352,6 +3448,11 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *
rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
full_path);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (rc == -ENOENT && is_tcon_dfs(tcon))
+ rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb,
+ full_path);
+#endif
if (rc != 0 && rc != -EREMOTE) {
kfree(full_path);
return rc;
@@ -3372,280 +3473,300 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *
}
#ifdef CONFIG_CIFS_DFS_UPCALL
-static void set_root_ses(struct cifs_sb_info *cifs_sb, const uuid_t *mount_id, struct cifs_ses *ses,
- struct cifs_ses **root_ses)
+static void set_root_ses(struct mount_ctx *mnt_ctx)
{
- if (ses) {
+ if (mnt_ctx->ses) {
spin_lock(&cifs_tcp_ses_lock);
- ses->ses_count++;
+ mnt_ctx->ses->ses_count++;
spin_unlock(&cifs_tcp_ses_lock);
- dfs_cache_add_refsrv_session(mount_id, ses);
+ dfs_cache_add_refsrv_session(&mnt_ctx->mount_id, mnt_ctx->ses);
}
- *root_ses = ses;
+ mnt_ctx->root_ses = mnt_ctx->ses;
}
-/* Set up next dfs prefix path in @dfs_path */
-static int next_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx,
- const unsigned int xid, struct TCP_Server_Info *server,
- struct cifs_tcon *tcon, char **dfs_path)
+static int is_dfs_mount(struct mount_ctx *mnt_ctx, bool *isdfs, struct dfs_cache_tgt_list *root_tl)
{
- char *path, *npath;
- int added_treename = is_tcon_dfs(tcon);
int rc;
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- path = cifs_build_path_to_root(ctx, cifs_sb, tcon, added_treename);
- if (!path)
- return -ENOMEM;
+ *isdfs = true;
- rc = is_path_remote(cifs_sb, ctx, xid, server, tcon);
- if (rc == -EREMOTE) {
- struct smb3_fs_context v = {NULL};
- /* if @path contains a tree name, skip it in the prefix path */
- if (added_treename) {
- rc = smb3_parse_devname(path, &v);
- if (rc)
- goto out;
- npath = build_unc_path_to_root(&v, cifs_sb, true);
- smb3_cleanup_fs_context_contents(&v);
- } else {
- v.UNC = ctx->UNC;
- v.prepath = path + 1;
- npath = build_unc_path_to_root(&v, cifs_sb, true);
- }
+ rc = mount_get_conns(mnt_ctx);
+ /*
+ * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally
+ * try to get a DFS referral (even cached) to determine whether it is a DFS mount.
+ *
+ * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem
+ * to respond with PATH_NOT_COVERED to requests that include the prefix.
+ */
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) ||
+ dfs_cache_find(mnt_ctx->xid, mnt_ctx->ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+ ctx->UNC + 1, NULL, root_tl)) {
+ if (rc)
+ return rc;
+ /* Check if it is fully accessible and then mount it */
+ rc = is_path_remote(mnt_ctx);
+ if (!rc)
+ *isdfs = false;
+ else if (rc != -EREMOTE)
+ return rc;
+ }
+ return 0;
+}
- if (IS_ERR(npath)) {
- rc = PTR_ERR(npath);
- goto out;
- }
+static int connect_dfs_target(struct mount_ctx *mnt_ctx, const char *full_path,
+ const char *ref_path, struct dfs_cache_tgt_iterator *tit)
+{
+ int rc;
+ struct dfs_info3_param ref = {};
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ char *oldmnt = cifs_sb->ctx->mount_options;
+
+ cifs_dbg(FYI, "%s: full_path=%s ref_path=%s target=%s\n", __func__, full_path, ref_path,
+ dfs_cache_get_tgt_name(tit));
+
+ rc = dfs_cache_get_tgt_referral(ref_path, tit, &ref);
+ if (rc)
+ goto out;
+
+ rc = expand_dfs_referral(mnt_ctx, full_path, &ref);
+ if (rc)
+ goto out;
- kfree(*dfs_path);
- *dfs_path = npath;
- rc = -EREMOTE;
+ /* Connect to new target only if we were redirected (e.g. mount options changed) */
+ if (oldmnt != cifs_sb->ctx->mount_options) {
+ mount_put_conns(mnt_ctx);
+ rc = mount_get_dfs_conns(mnt_ctx);
+ }
+ if (!rc) {
+ if (cifs_is_referral_server(mnt_ctx->tcon, &ref))
+ set_root_ses(mnt_ctx);
+ rc = dfs_cache_update_tgthint(mnt_ctx->xid, mnt_ctx->root_ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), ref_path, tit);
}
out:
- kfree(path);
+ free_dfs_info_param(&ref);
return rc;
}
-/* Check if resolved targets can handle any DFS referrals */
-static int is_referral_server(const char *ref_path, struct cifs_sb_info *cifs_sb,
- struct cifs_tcon *tcon, bool *ref_server)
+static int connect_dfs_root(struct mount_ctx *mnt_ctx, struct dfs_cache_tgt_list *root_tl)
{
int rc;
- struct dfs_info3_param ref = {0};
+ char *full_path;
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ struct dfs_cache_tgt_iterator *tit;
- cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path);
+ /* Put initial connections as they might be shared with other mounts. We need unique dfs
+ * connections per mount to properly failover, so mount_get_dfs_conns() must be used from
+ * now on.
+ */
+ mount_put_conns(mnt_ctx);
+ mount_get_dfs_conns(mnt_ctx);
+ set_root_ses(mnt_ctx);
- if (is_tcon_dfs(tcon)) {
- *ref_server = true;
- } else {
- char *npath;
+ full_path = build_unc_path_to_root(ctx, cifs_sb, true);
+ if (IS_ERR(full_path))
+ return PTR_ERR(full_path);
- npath = dfs_cache_canonical_path(ref_path, cifs_sb->local_nls, cifs_remap(cifs_sb));
- if (IS_ERR(npath))
- return PTR_ERR(npath);
+ mnt_ctx->origin_fullpath = dfs_cache_canonical_path(ctx->UNC, cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ if (IS_ERR(mnt_ctx->origin_fullpath)) {
+ rc = PTR_ERR(mnt_ctx->origin_fullpath);
+ mnt_ctx->origin_fullpath = NULL;
+ goto out;
+ }
- rc = dfs_cache_noreq_find(npath, &ref, NULL);
- kfree(npath);
- if (rc) {
- cifs_dbg(VFS, "%s: dfs_cache_noreq_find: failed (rc=%d)\n", __func__, rc);
- return rc;
+ /* Try all dfs root targets */
+ for (rc = -ENOENT, tit = dfs_cache_get_tgt_iterator(root_tl);
+ tit; tit = dfs_cache_get_next_tgt(root_tl, tit)) {
+ rc = connect_dfs_target(mnt_ctx, full_path, mnt_ctx->origin_fullpath + 1, tit);
+ if (!rc) {
+ mnt_ctx->leaf_fullpath = kstrdup(mnt_ctx->origin_fullpath, GFP_KERNEL);
+ if (!mnt_ctx->leaf_fullpath)
+ rc = -ENOMEM;
+ break;
}
- cifs_dbg(FYI, "%s: ref.flags=0x%x\n", __func__, ref.flags);
- /*
- * Check if all targets are capable of handling DFS referrals as per
- * MS-DFSC 2.2.4 RESP_GET_DFS_REFERRAL.
- */
- *ref_server = !!(ref.flags & DFSREF_REFERRAL_SERVER);
- free_dfs_info_param(&ref);
}
- return 0;
+
+out:
+ kfree(full_path);
+ return rc;
}
-int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
+static int __follow_dfs_link(struct mount_ctx *mnt_ctx)
{
- int rc = 0;
- unsigned int xid;
- struct TCP_Server_Info *server = NULL;
- struct cifs_ses *ses = NULL, *root_ses = NULL;
- struct cifs_tcon *tcon = NULL;
- int count = 0;
- uuid_t mount_id = {0};
- char *ref_path = NULL, *full_path = NULL;
- char *oldmnt = NULL;
- bool ref_server = false;
+ int rc;
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ char *full_path;
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ struct dfs_cache_tgt_iterator *tit;
- rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon);
- /*
- * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally
- * try to get an DFS referral (even cached) to determine whether it is an DFS mount.
- *
- * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem
- * to respond with PATH_NOT_COVERED to requests that include the prefix.
- */
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) ||
- dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ctx->UNC + 1, NULL,
- NULL)) {
- if (rc)
- goto error;
- /* Check if it is fully accessible and then mount it */
- rc = is_path_remote(cifs_sb, ctx, xid, server, tcon);
- if (!rc)
- goto out;
- if (rc != -EREMOTE)
- goto error;
+ full_path = build_unc_path_to_root(ctx, cifs_sb, true);
+ if (IS_ERR(full_path))
+ return PTR_ERR(full_path);
+
+ kfree(mnt_ctx->leaf_fullpath);
+ mnt_ctx->leaf_fullpath = dfs_cache_canonical_path(full_path, cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ if (IS_ERR(mnt_ctx->leaf_fullpath)) {
+ rc = PTR_ERR(mnt_ctx->leaf_fullpath);
+ mnt_ctx->leaf_fullpath = NULL;
+ goto out;
}
- mount_put_conns(cifs_sb, xid, server, ses, tcon);
- /*
- * Ignore error check here because we may failover to other targets from cached a
- * referral.
+ /* Get referral from dfs link */
+ rc = dfs_cache_find(mnt_ctx->xid, mnt_ctx->root_ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), mnt_ctx->leaf_fullpath + 1, NULL, &tl);
+ if (rc)
+ goto out;
+
+ /* Try all dfs link targets. If an I/O fails from currently connected DFS target with an
+ * error other than STATUS_PATH_NOT_COVERED (-EREMOTE), then retry it from other targets as
+ * specified in MS-DFSC "3.1.5.2 I/O Operation to Target Fails with an Error Other Than
+ * STATUS_PATH_NOT_COVERED."
*/
- (void)mount_get_dfs_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon);
+ for (rc = -ENOENT, tit = dfs_cache_get_tgt_iterator(&tl);
+ tit; tit = dfs_cache_get_next_tgt(&tl, tit)) {
+ rc = connect_dfs_target(mnt_ctx, full_path, mnt_ctx->leaf_fullpath + 1, tit);
+ if (!rc) {
+ rc = is_path_remote(mnt_ctx);
+ if (!rc || rc == -EREMOTE)
+ break;
+ }
+ }
- /* Get path of DFS root */
- ref_path = build_unc_path_to_root(ctx, cifs_sb, false);
- if (IS_ERR(ref_path)) {
- rc = PTR_ERR(ref_path);
- ref_path = NULL;
- goto error;
+out:
+ kfree(full_path);
+ dfs_cache_free_tgts(&tl);
+ return rc;
+}
+
+static int follow_dfs_link(struct mount_ctx *mnt_ctx)
+{
+ int rc;
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ char *full_path;
+ int num_links = 0;
+
+ full_path = build_unc_path_to_root(ctx, cifs_sb, true);
+ if (IS_ERR(full_path))
+ return PTR_ERR(full_path);
+
+ kfree(mnt_ctx->origin_fullpath);
+ mnt_ctx->origin_fullpath = dfs_cache_canonical_path(full_path, cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ kfree(full_path);
+
+ if (IS_ERR(mnt_ctx->origin_fullpath)) {
+ rc = PTR_ERR(mnt_ctx->origin_fullpath);
+ mnt_ctx->origin_fullpath = NULL;
+ return rc;
}
- uuid_gen(&mount_id);
- set_root_ses(cifs_sb, &mount_id, ses, &root_ses);
do {
- /* Save full path of last DFS path we used to resolve final target server */
- kfree(full_path);
- full_path = build_unc_path_to_root(ctx, cifs_sb, !!count);
- if (IS_ERR(full_path)) {
- rc = PTR_ERR(full_path);
- full_path = NULL;
- break;
- }
- /* Chase referral */
- oldmnt = cifs_sb->ctx->mount_options;
- rc = expand_dfs_referral(xid, root_ses, ctx, cifs_sb, ref_path + 1);
- if (rc)
- break;
- /* Connect to new DFS target only if we were redirected */
- if (oldmnt != cifs_sb->ctx->mount_options) {
- mount_put_conns(cifs_sb, xid, server, ses, tcon);
- rc = mount_get_dfs_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon);
- }
- if (rc && !server && !ses) {
- /* Failed to connect. Try to connect to other targets in the referral. */
- rc = do_dfs_failover(ref_path + 1, full_path, cifs_sb, ctx, root_ses, &xid,
- &server, &ses, &tcon);
- }
- if (rc == -EACCES || rc == -EOPNOTSUPP || !server || !ses)
+ rc = __follow_dfs_link(mnt_ctx);
+ if (!rc || rc != -EREMOTE)
break;
- if (!tcon)
- continue;
+ } while (rc = -ELOOP, ++num_links < MAX_NESTED_LINKS);
- /* Make sure that requests go through new root servers */
- rc = is_referral_server(ref_path + 1, cifs_sb, tcon, &ref_server);
- if (rc)
- break;
- if (ref_server)
- set_root_ses(cifs_sb, &mount_id, ses, &root_ses);
+ return rc;
+}
+
+/* Set up DFS referral paths for failover */
+static void setup_server_referral_paths(struct mount_ctx *mnt_ctx)
+{
+ struct TCP_Server_Info *server = mnt_ctx->server;
+
+ mutex_lock(&server->refpath_lock);
+ server->origin_fullpath = mnt_ctx->origin_fullpath;
+ server->leaf_fullpath = mnt_ctx->leaf_fullpath;
+ server->current_fullpath = mnt_ctx->leaf_fullpath;
+ mutex_unlock(&server->refpath_lock);
+ mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL;
+}
- /* Get next dfs path and then continue chasing them if -EREMOTE */
- rc = next_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path);
- /* Prevent recursion on broken link referrals */
- if (rc == -EREMOTE && ++count > MAX_NESTED_LINKS)
- rc = -ELOOP;
- } while (rc == -EREMOTE);
+int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
+{
+ int rc;
+ struct mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ bool isdfs;
- if (rc || !tcon || !ses)
+ rc = is_dfs_mount(&mnt_ctx, &isdfs, &tl);
+ if (rc)
goto error;
+ if (!isdfs)
+ goto out;
- kfree(ref_path);
- /*
- * Store DFS full path in both superblock and tree connect structures.
- *
- * For DFS root mounts, the prefix path (cifs_sb->prepath) is preserved during reconnect so
- * only the root path is set in cifs_sb->origin_fullpath and tcon->dfs_path. And for DFS
- * links, the prefix path is included in both and may be changed during reconnect. See
- * cifs_tree_connect().
- */
- ref_path = dfs_cache_canonical_path(full_path, cifs_sb->local_nls, cifs_remap(cifs_sb));
- kfree(full_path);
- full_path = NULL;
+ uuid_gen(&mnt_ctx.mount_id);
+ rc = connect_dfs_root(&mnt_ctx, &tl);
+ dfs_cache_free_tgts(&tl);
- if (IS_ERR(ref_path)) {
- rc = PTR_ERR(ref_path);
- ref_path = NULL;
+ if (rc)
goto error;
- }
- cifs_sb->origin_fullpath = ref_path;
- ref_path = kstrdup(cifs_sb->origin_fullpath, GFP_KERNEL);
- if (!ref_path) {
- rc = -ENOMEM;
+ rc = is_path_remote(&mnt_ctx);
+ if (rc)
+ rc = follow_dfs_link(&mnt_ctx);
+ if (rc)
goto error;
- }
- spin_lock(&cifs_tcp_ses_lock);
- tcon->dfs_path = ref_path;
- ref_path = NULL;
- spin_unlock(&cifs_tcp_ses_lock);
+ setup_server_referral_paths(&mnt_ctx);
/*
- * After reconnecting to a different server, unique ids won't
- * match anymore, so we disable serverino. This prevents
- * dentry revalidation to think the dentry are stale (ESTALE).
+ * After reconnecting to a different server, unique ids won't match anymore, so we disable
+ * serverino. This prevents dentry revalidation from treating the dentries as stale (ESTALE).
*/
cifs_autodisable_serverino(cifs_sb);
/*
- * Force the use of prefix path to support failover on DFS paths that
- * resolve to targets that have different prefix paths.
+ * Force the use of prefix path to support failover on DFS paths that resolve to targets
+ * that have different prefix paths.
*/
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
kfree(cifs_sb->prepath);
cifs_sb->prepath = ctx->prepath;
ctx->prepath = NULL;
- uuid_copy(&cifs_sb->dfs_mount_id, &mount_id);
+ uuid_copy(&cifs_sb->dfs_mount_id, &mnt_ctx.mount_id);
out:
- free_xid(xid);
- cifs_try_adding_channels(cifs_sb, ses);
- return mount_setup_tlink(cifs_sb, ses, tcon);
+ free_xid(mnt_ctx.xid);
+ cifs_try_adding_channels(cifs_sb, mnt_ctx.ses);
+ return mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
error:
- kfree(ref_path);
- kfree(full_path);
- kfree(cifs_sb->origin_fullpath);
- dfs_cache_put_refsrv_sessions(&mount_id);
- mount_put_conns(cifs_sb, xid, server, ses, tcon);
+ dfs_cache_put_refsrv_sessions(&mnt_ctx.mount_id);
+ kfree(mnt_ctx.origin_fullpath);
+ kfree(mnt_ctx.leaf_fullpath);
+ mount_put_conns(&mnt_ctx);
return rc;
}
#else
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
{
int rc = 0;
- unsigned int xid;
- struct cifs_ses *ses;
- struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
+ struct mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
- rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon);
+ rc = mount_get_conns(&mnt_ctx);
if (rc)
goto error;
- if (tcon) {
- rc = is_path_remote(cifs_sb, ctx, xid, server, tcon);
+ if (mnt_ctx.tcon) {
+ rc = is_path_remote(&mnt_ctx);
if (rc == -EREMOTE)
rc = -EOPNOTSUPP;
if (rc)
goto error;
}
- free_xid(xid);
-
- return mount_setup_tlink(cifs_sb, ses, tcon);
+ free_xid(mnt_ctx.xid);
+ return mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
error:
- mount_put_conns(cifs_sb, xid, server, ses, tcon);
+ mount_put_conns(&mnt_ctx);
return rc;
}
#endif
@@ -3729,8 +3850,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
if (rc == 0) {
bool is_unicode;
- tcon->tidStatus = CifsGood;
- tcon->need_reconnect = false;
tcon->tid = smb_buffer_response->Tid;
bcc_ptr = pByteArea(smb_buffer_response);
bytes_left = get_bcc(smb_buffer_response);
@@ -3814,32 +3933,42 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
kfree(cifs_sb->prepath);
#ifdef CONFIG_CIFS_DFS_UPCALL
dfs_cache_put_refsrv_sessions(&cifs_sb->dfs_mount_id);
- kfree(cifs_sb->origin_fullpath);
#endif
call_rcu(&cifs_sb->rcu, delayed_free);
}
int
-cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
+cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
{
int rc = 0;
- struct TCP_Server_Info *server = cifs_ses_server(ses);
if (!server->ops->need_neg || !server->ops->negotiate)
return -ENOSYS;
/* only send once per connect */
- if (!server->ops->need_neg(server))
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!server->ops->need_neg(server) ||
+ server->tcpStatus != CifsNeedNegotiate) {
+ spin_unlock(&cifs_tcp_ses_lock);
return 0;
+ }
+ server->tcpStatus = CifsInNegotiate;
+ spin_unlock(&cifs_tcp_ses_lock);
- rc = server->ops->negotiate(xid, ses);
+ rc = server->ops->negotiate(xid, ses, server);
if (rc == 0) {
- spin_lock(&GlobalMid_Lock);
- if (server->tcpStatus == CifsNeedNegotiate)
- server->tcpStatus = CifsGood;
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsInNegotiate)
+ server->tcpStatus = CifsNeedSessSetup;
else
rc = -EHOSTDOWN;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ } else {
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsInNegotiate)
+ server->tcpStatus = CifsNeedNegotiate;
+ spin_unlock(&cifs_tcp_ses_lock);
}
return rc;
@@ -3847,12 +3976,27 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
int
cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
struct nls_table *nls_info)
{
int rc = -ENOSYS;
- struct TCP_Server_Info *server = cifs_ses_server(ses);
+ bool is_binding = false;
+
+ /* only send once per connect */
+ spin_lock(&cifs_tcp_ses_lock);
+ if ((server->tcpStatus != CifsNeedSessSetup) &&
+ (ses->status == CifsGood)) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return 0;
+ }
+ server->tcpStatus = CifsInSessSetup;
+ spin_unlock(&cifs_tcp_ses_lock);
- if (!ses->binding) {
+ spin_lock(&ses->chan_lock);
+ is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+ spin_unlock(&ses->chan_lock);
+
+ if (!is_binding) {
ses->capabilities = server->capabilities;
if (!linuxExtEnabled)
ses->capabilities &= (~server->vals->cap_unix);
@@ -3870,10 +4014,26 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
server->sec_mode, server->capabilities, server->timeAdj);
if (server->ops->sess_setup)
- rc = server->ops->sess_setup(xid, ses, nls_info);
+ rc = server->ops->sess_setup(xid, ses, server, nls_info);
- if (rc)
+ if (rc) {
cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc);
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsInSessSetup)
+ server->tcpStatus = CifsNeedSessSetup;
+ spin_unlock(&cifs_tcp_ses_lock);
+ } else {
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsInSessSetup)
+ server->tcpStatus = CifsGood;
+ /* Even if one channel is active, session is in good state */
+ ses->status = CifsGood;
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ spin_lock(&ses->chan_lock);
+ cifs_chan_clear_need_reconnect(ses, server);
+ spin_unlock(&ses->chan_lock);
+ }
return rc;
}
@@ -4141,111 +4301,293 @@ cifs_prune_tlinks(struct work_struct *work)
}
#ifdef CONFIG_CIFS_DFS_UPCALL
-int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc)
+/* Update dfs referral path of the server (leaf_fullpath and current_fullpath) */
+static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb,
+ const char *target)
+{
+ int rc = 0;
+ size_t len = strlen(target);
+ char *refpath, *npath;
+
+ if (unlikely(len < 2 || *target != '\\'))
+ return -EINVAL;
+
+ if (target[1] == '\\') {
+ len += 1;
+ refpath = kmalloc(len, GFP_KERNEL);
+ if (!refpath)
+ return -ENOMEM;
+
+ scnprintf(refpath, len, "%s", target);
+ } else {
+ len += sizeof("\\");
+ refpath = kmalloc(len, GFP_KERNEL);
+ if (!refpath)
+ return -ENOMEM;
+
+ scnprintf(refpath, len, "\\%s", target);
+ }
+
+ npath = dfs_cache_canonical_path(refpath, cifs_sb->local_nls, cifs_remap(cifs_sb));
+ kfree(refpath);
+
+ if (IS_ERR(npath)) {
+ rc = PTR_ERR(npath);
+ } else {
+ mutex_lock(&server->refpath_lock);
+ kfree(server->leaf_fullpath);
+ server->leaf_fullpath = npath;
+ mutex_unlock(&server->refpath_lock);
+ server->current_fullpath = server->leaf_fullpath;
+ }
+ return rc;
+}
+
+static int target_share_matches_server(struct TCP_Server_Info *server, const char *tcp_host,
+ size_t tcp_host_len, char *share, bool *target_match)
+{
+ int rc = 0;
+ const char *dfs_host;
+ size_t dfs_host_len;
+
+ *target_match = true;
+ extract_unc_hostname(share, &dfs_host, &dfs_host_len);
+
+ /* Check if hostnames or addresses match */
+ if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
+ cifs_dbg(FYI, "%s: %.*s doesn't match %.*s\n", __func__, (int)dfs_host_len,
+ dfs_host, (int)tcp_host_len, tcp_host);
+ rc = match_target_ip(server, dfs_host, dfs_host_len, target_match);
+ if (rc)
+ cifs_dbg(VFS, "%s: failed to match target ip: %d\n", __func__, rc);
+ }
+ return rc;
+}
+
+/*
+ * Walk the DFS targets in @tl once and try to TREE_CONNECT @tcon to one of them.
+ *
+ * For every target, the share and prefix are parsed relative to server->current_fullpath
+ * and the share's host is checked against the current connection. Targets on another
+ * host are skipped with -EHOSTUNREACH. If the session's IPC tcon needs reconnecting, it is
+ * tree connected first. The share is then written into @tree (a buffer of MAX_TREE_SIZE
+ * bytes provided by the caller).
+ *
+ * When @islink is false a single TREE_CONNECT is attempted and its result returned.
+ * When @islink is true the target itself is looked up with dfs_cache_find(): a non-zero
+ * result leads to a plain TREE_CONNECT followed by a target hint and prefix path update,
+ * while a zero result replaces @tl with the newly found targets, updates the server's
+ * full path and returns -EREMOTE.
+ *
+ * Returns 0 on success, -ENOENT if @tl yields no iterator, -EREMOTE as described above,
+ * or the error of the step that failed.
+ */
+static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, char *tree, bool islink,
+ struct dfs_cache_tgt_list *tl)
{
int rc;
struct TCP_Server_Info *server = tcon->ses->server;
const struct smb_version_operations *ops = server->ops;
- struct dfs_cache_tgt_list tl;
- struct dfs_cache_tgt_iterator *it = NULL;
- char *tree;
+ struct cifs_tcon *ipc = tcon->ses->tcon_ipc;
+ char *share = NULL, *prefix = NULL;
const char *tcp_host;
size_t tcp_host_len;
- const char *dfs_host;
- size_t dfs_host_len;
- char *share = NULL, *prefix = NULL;
- struct dfs_info3_param ref = {0};
- bool isroot;
+ struct dfs_cache_tgt_iterator *tit;
+ bool target_match;
- tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
- if (!tree)
- return -ENOMEM;
+ extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len);
- /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */
- if (!tcon->dfs_path || dfs_cache_noreq_find(tcon->dfs_path + 1, &ref, &tl)) {
- if (tcon->ipc) {
- scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
- rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc);
- } else {
- rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc);
- }
+ tit = dfs_cache_get_tgt_iterator(tl);
+ if (!tit) {
+ rc = -ENOENT;
goto out;
}
- isroot = ref.server_type == DFS_TYPE_ROOT;
- free_dfs_info_param(&ref);
-
- extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len);
-
- for (it = dfs_cache_get_tgt_iterator(&tl); it; it = dfs_cache_get_next_tgt(&tl, it)) {
- bool target_match;
+ /* Try to tree connect to all dfs targets */
+ for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) {
+ const char *target = dfs_cache_get_tgt_name(tit);
+ struct dfs_cache_tgt_list ntl = DFS_CACHE_TGT_LIST_INIT(ntl);
kfree(share);
kfree(prefix);
- share = NULL;
- prefix = NULL;
+ share = prefix = NULL;
- rc = dfs_cache_get_tgt_share(tcon->dfs_path + 1, it, &share, &prefix);
+ /* Check if share matches with tcp ses */
+ rc = dfs_cache_get_tgt_share(server->current_fullpath + 1, tit, &share, &prefix);
if (rc) {
- cifs_dbg(VFS, "%s: failed to parse target share %d\n",
- __func__, rc);
- continue;
+ cifs_dbg(VFS, "%s: failed to parse target share: %d\n", __func__, rc);
+ break;
}
- extract_unc_hostname(share, &dfs_host, &dfs_host_len);
-
- if (dfs_host_len != tcp_host_len
- || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
- cifs_dbg(FYI, "%s: %.*s doesn't match %.*s\n", __func__, (int)dfs_host_len,
- dfs_host, (int)tcp_host_len, tcp_host);
+ rc = target_share_matches_server(server, tcp_host, tcp_host_len, share,
+ &target_match);
+ if (rc)
+ break;
+ if (!target_match) {
+ /* -EHOSTUNREACH remains the result if no later target matches */
+ rc = -EHOSTUNREACH;
+ continue;
+ }
- rc = match_target_ip(server, dfs_host, dfs_host_len, &target_match);
- if (rc) {
- cifs_dbg(VFS, "%s: failed to match target ip: %d\n", __func__, rc);
+ if (ipc->need_reconnect) {
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
+ rc = ops->tree_connect(xid, ipc->ses, tree, ipc, cifs_sb->local_nls);
+ if (rc)
break;
- }
-
- if (!target_match) {
- cifs_dbg(FYI, "%s: skipping target\n", __func__);
- continue;
- }
}
- if (tcon->ipc) {
- scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", share);
- rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", share);
+ if (!islink) {
+ /* Not a link referral: one TREE_CONNECT attempt ends the loop */
+ rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
+ break;
+ }
+ /*
+ * If no dfs referrals were returned from link target, then just do a TREE_CONNECT
+ * to it. Otherwise, cache the dfs referral and then mark current tcp ses for
+ * reconnect so either the demultiplex thread or the echo worker will reconnect to
+ * newly resolved target.
+ */
+ if (dfs_cache_find(xid, tcon->ses, cifs_sb->local_nls, cifs_remap(cifs_sb), target,
+ NULL, &ntl)) {
+ rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
+ if (rc)
+ continue;
+ rc = dfs_cache_noreq_update_tgthint(server->current_fullpath + 1, tit);
+ if (!rc)
+ rc = cifs_update_super_prepath(cifs_sb, prefix);
} else {
- scnprintf(tree, MAX_TREE_SIZE, "\\%s", share);
- rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc);
- /* Only handle prefix paths of DFS link targets */
- if (!rc && !isroot) {
- rc = update_super_prepath(tcon, prefix);
- break;
- }
+ /* Target is another dfs share */
+ rc = update_server_fullpath(server, cifs_sb, target);
+ dfs_cache_free_tgts(tl);
+
+ if (!rc) {
+ rc = -EREMOTE;
+ list_replace_init(&ntl.tl_list, &tl->tl_list);
+ } else
+ dfs_cache_free_tgts(&ntl);
}
- if (rc == -EREMOTE)
- break;
+ break;
}
+out:
kfree(share);
kfree(prefix);
- if (!rc) {
- if (it)
- rc = dfs_cache_noreq_update_tgthint(tcon->dfs_path + 1, it);
- else
- rc = -ENOENT;
+ return rc;
+}
+
+/*
+ * Tree connect @tcon to a DFS target, chasing nested referrals.
+ *
+ * __tree_connect_dfs_target() is called repeatedly for as long as it returns -EREMOTE,
+ * up to MAX_NESTED_LINKS times; running out of attempts yields -ELOOP. Any other result,
+ * success included, ends the walk. @tl is always released with dfs_cache_free_tgts()
+ * before returning.
+ */
+static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, char *tree, bool islink,
+ struct dfs_cache_tgt_list *tl)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+ int links = 0;
+ int rc;
+
+ for (;;) {
+ rc = __tree_connect_dfs_target(xid, tcon, cifs_sb, tree, islink, tl);
+ if (rc != -EREMOTE)
+ break;
+
+ /* One more nested referral was followed; give up once the limit is reached */
+ rc = -ELOOP;
+ if (++links >= MAX_NESTED_LINKS)
+ break;
+ }
+
+ /*
+ * Nothing reachable from the last referral path: fall back to the original
+ * referral path and signal cifsd to reconnect.
+ */
+ if (rc && server->current_fullpath != server->origin_fullpath) {
+ server->current_fullpath = server->origin_fullpath;
+ cifs_signal_cifsd_for_reconnect(server, true);
+ }
+
+ dfs_cache_free_tgts(tl);
+ return rc;
+}
+
+/*
+ * Send a TREE_CONNECT for @tcon, resolving DFS targets when a referral path is cached.
+ *
+ * Nothing is sent and 0 is returned unless the session status is CifsGood and the tcon is
+ * in TID_NEW or TID_NEED_TCON; otherwise the tcon is put into TID_IN_TCON for the attempt.
+ * An IPC tcon connects straight to the IPC$ share of server->hostname using @nlsc. Any
+ * other tcon uses the superblock's local_nls: it reconnects to tcon->treeName when
+ * server->current_fullpath is unset or dfs_cache_noreq_find() returns non-zero for it, and
+ * goes through tree_connect_dfs_target() otherwise.
+ *
+ * On exit a tcon that is still in TID_IN_TCON becomes TID_GOOD on success (need_reconnect
+ * is cleared too) or TID_NEED_TCON on failure.
+ *
+ * Returns 0 on success or when nothing was sent, otherwise the error of the failing step.
+ */
+int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+ const struct smb_version_operations *ops = server->ops;
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ struct dfs_info3_param ref = {0};
+ struct super_block *sb = NULL;
+ struct cifs_sb_info *cifs_sb;
+ char *tree;
+ bool islink;
+ int rc;
+
+ /* only send once per connect */
+ spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->ses->status != CifsGood ||
+ !(tcon->status == TID_NEW || tcon->status == TID_NEED_TCON)) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return 0;
+ }
+ tcon->status = TID_IN_TCON;
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
+ if (!tree) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* The IPC tcon never goes through the DFS cache */
+ if (tcon->ipc) {
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
+ rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc);
+ goto out;
+ }
+
+ sb = cifs_get_tcp_super(server);
+ if (IS_ERR(sb)) {
+ rc = PTR_ERR(sb);
+ cifs_dbg(VFS, "%s: could not find superblock: %d\n", __func__, rc);
+ goto out;
+ }
+
+ cifs_sb = CIFS_SB(sb);
+
+ /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */
+ if (!server->current_fullpath ||
+ dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) {
+ rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, cifs_sb->local_nls);
+ goto out;
+ }
+
+ islink = ref.server_type == DFS_TYPE_LINK;
+ rc = tree_connect_dfs_target(xid, tcon, cifs_sb, tree, islink, &tl);
+ free_dfs_info_param(&ref);
+
out:
kfree(tree);
+ /* NOTE(review): sb may be NULL or an ERR_PTR here; confirm cifs_put_tcp_super() accepts both */
+ cifs_put_tcp_super(sb);
+
+ /* Only a tcon still in TID_IN_TCON is moved to its final state */
+ spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = rc ? TID_NEED_TCON : TID_GOOD;
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (!rc)
+ tcon->need_reconnect = false;
+
return rc;
}
#else
+/*
+ * Variant of cifs_tree_connect() in the #else branch: a plain TREE_CONNECT to
+ * tcon->treeName with no DFS handling.
+ *
+ * Nothing is sent and 0 is returned unless the session status is CifsGood and the tcon is
+ * in TID_NEW or TID_NEED_TCON. A tcon still in TID_IN_TCON afterwards becomes TID_GOOD on
+ * success (need_reconnect is cleared too) or TID_NEED_TCON on failure.
+ */
int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc)
{
+ int rc;
const struct smb_version_operations *ops = tcon->ses->server->ops;
- return ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc);
+ /* only send once per connect */
+ spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->ses->status != CifsGood ||
+ !(tcon->status == TID_NEW || tcon->status == TID_NEED_TCON)) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return 0;
+ }
+ tcon->status = TID_IN_TCON;
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc);
+
+ /* Only a tcon still in TID_IN_TCON is moved to its final state */
+ spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = rc ? TID_NEED_TCON : TID_GOOD;
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (!rc)
+ tcon->need_reconnect = false;
+
+ return rc;
}
#endif
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 283745592844..956f8e5cf3e7 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -283,7 +283,7 @@ static int dfscache_proc_show(struct seq_file *m, void *v)
seq_printf(m,
"cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n",
ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link",
- ce->ttl, ce->etime.tv_nsec, ce->ref_flags, ce->hdr_flags,
+ ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags,
IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no",
ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no");
@@ -1355,18 +1355,13 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach
}
cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__);
- for (i = 0; i < tcon->ses->chan_count; i++) {
- spin_lock(&GlobalMid_Lock);
- if (tcon->ses->chans[i].server->tcpStatus != CifsExiting)
- tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
- }
+ cifs_signal_cifsd_for_reconnect(tcon->ses->server, true);
}
/* Refresh dfs referral of tcon and mark it for reconnect if needed */
-static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool force_refresh)
+static int __refresh_tcon(const char *path, struct cifs_ses **sessions, struct cifs_tcon *tcon,
+ bool force_refresh)
{
- const char *path = tcon->dfs_path + 1;
struct cifs_ses *ses;
struct cache_entry *ce;
struct dfs_info3_param *refs = NULL;
@@ -1422,6 +1417,22 @@ out:
return rc;
}
+/*
+ * Refresh the cached referrals behind @tcon's server while holding server->refpath_lock.
+ *
+ * Does nothing when the server has no origin_fullpath. Otherwise the leaf path is
+ * refreshed first, but only if it is set and differs (case-insensitively) from the origin
+ * path, and then the origin path. Both paths are passed without their first character.
+ *
+ * Always returns 0; results of __refresh_tcon() are not propagated.
+ */
+static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool force_refresh)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ mutex_lock(&server->refpath_lock);
+ if (!server->origin_fullpath)
+ goto unlock;
+
+ if (server->leaf_fullpath &&
+ strcasecmp(server->leaf_fullpath, server->origin_fullpath) != 0)
+ __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh);
+
+ __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh);
+unlock:
+ mutex_unlock(&server->refpath_lock);
+
+ return 0;
+}
+
/**
* dfs_cache_remount_fs - remount a DFS share
*
@@ -1435,6 +1446,7 @@ out:
int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
{
struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
struct mount_group *mg;
struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL};
int rc;
@@ -1443,13 +1455,15 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
return -EINVAL;
tcon = cifs_sb_master_tcon(cifs_sb);
- if (!tcon->dfs_path) {
- cifs_dbg(FYI, "%s: not a dfs tcon\n", __func__);
+ server = tcon->ses->server;
+
+ if (!server->origin_fullpath) {
+ cifs_dbg(FYI, "%s: not a dfs mount\n", __func__);
return 0;
}
if (uuid_is_null(&cifs_sb->dfs_mount_id)) {
- cifs_dbg(FYI, "%s: tcon has no dfs mount group id\n", __func__);
+ cifs_dbg(FYI, "%s: no dfs mount group id\n", __func__);
return -EINVAL;
}
@@ -1457,7 +1471,7 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
mg = find_mount_group_locked(&cifs_sb->dfs_mount_id);
if (IS_ERR(mg)) {
mutex_unlock(&mount_group_list_lock);
- cifs_dbg(FYI, "%s: tcon has ipc session to refresh referral\n", __func__);
+ cifs_dbg(FYI, "%s: no ipc session for refreshing referral\n", __func__);
return PTR_ERR(mg);
}
kref_get(&mg->refcount);
@@ -1498,9 +1512,12 @@ static void refresh_mounts(struct cifs_ses **sessions)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ if (!server->is_dfs_conn)
+ continue;
+
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
- if (tcon->dfs_path) {
+ if (!tcon->ipc && !tcon->need_reconnect) {
tcon->tc_count++;
list_add_tail(&tcon->ulist, &tcons);
}
@@ -1510,8 +1527,19 @@ static void refresh_mounts(struct cifs_ses **sessions)
spin_unlock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) {
+ struct TCP_Server_Info *server = tcon->ses->server;
+
list_del_init(&tcon->ulist);
- refresh_tcon(sessions, tcon, false);
+
+ mutex_lock(&server->refpath_lock);
+ if (server->origin_fullpath) {
+ if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
+ server->origin_fullpath))
+ __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false);
+ __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false);
+ }
+ mutex_unlock(&server->refpath_lock);
+
cifs_put_tcon(tcon);
}
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6e8e7cc26ae2..ce9b22aecfba 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -22,6 +22,7 @@
#include "cifs_unicode.h"
#include "fs_context.h"
#include "cifs_ioctl.h"
+#include "fscache.h"
static void
renew_parental_timestamps(struct dentry *direntry)
@@ -507,8 +508,12 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
server->ops->close(xid, tcon, &fid);
cifs_del_pending_open(&open);
rc = -ENOMEM;
+ goto out;
}
+ fscache_use_cookie(cifs_inode_cookie(file_inode(file)),
+ file->f_mode & FMODE_WRITE);
+
out:
cifs_put_tlink(tlink);
out_free_xid:
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 13f3182cf796..d511a78383c3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -376,8 +376,6 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file)
struct cifsLockInfo *li, *tmp;
struct super_block *sb = inode->i_sb;
- cifs_fscache_release_inode_cookie(inode);
-
/*
* Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
@@ -570,7 +568,7 @@ int cifs_open(struct inode *inode, struct file *file)
spin_lock(&CIFS_I(inode)->deferred_lock);
cifs_del_deferred_close(cfile);
spin_unlock(&CIFS_I(inode)->deferred_lock);
- goto out;
+ goto use_cache;
} else {
_cifsFileInfo_put(cfile, true, false);
}
@@ -632,8 +630,6 @@ int cifs_open(struct inode *inode, struct file *file)
goto out;
}
- cifs_fscache_set_inode_cookie(inode, file);
-
if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
/*
* Time to set mode which we can not set earlier due to
@@ -652,6 +648,15 @@ int cifs_open(struct inode *inode, struct file *file)
cfile->pid);
}
+use_cache:
+ fscache_use_cookie(cifs_inode_cookie(file_inode(file)),
+ file->f_mode & FMODE_WRITE);
+ if (file->f_flags & O_DIRECT &&
+ (!((file->f_flags & O_ACCMODE) != O_RDONLY) ||
+ file->f_flags & O_APPEND))
+ cifs_invalidate_cache(file_inode(file),
+ FSCACHE_INVAL_DIO_WRITE);
+
out:
free_dentry_path(page);
free_xid(xid);
@@ -876,6 +881,8 @@ int cifs_close(struct inode *inode, struct file *file)
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_deferred_close *dclose;
+ cifs_fscache_unuse_inode_cookie(inode, file->f_mode & FMODE_WRITE);
+
if (file->private_data != NULL) {
cfile = file->private_data;
file->private_data = NULL;
@@ -886,7 +893,6 @@ int cifs_close(struct inode *inode, struct file *file)
dclose) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
inode->i_ctime = inode->i_mtime = current_time(inode);
- cifs_fscache_update_inode_cookie(inode);
}
spin_lock(&cinode->deferred_lock);
cifs_add_deferred_close(cfile, dclose);
@@ -2692,12 +2698,23 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
tcon = tlink_tcon(smbfile->tlink);
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
server = tcon->ses->server;
- if (server->ops->flush)
- rc = server->ops->flush(xid, tcon, &smbfile->fid);
- else
+ if (server->ops->flush == NULL) {
rc = -ENOSYS;
+ goto strict_fsync_exit;
+ }
+
+ if ((OPEN_FMODE(smbfile->f_flags) & FMODE_WRITE) == 0) {
+ smbfile = find_writable_file(CIFS_I(inode), FIND_WR_ANY);
+ if (smbfile) {
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
+ cifsFileInfo_put(smbfile);
+ } else
+ cifs_dbg(FYI, "ignore fsync for file not open for write\n");
+ } else
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
}
+strict_fsync_exit:
free_xid(xid);
return rc;
}
@@ -2709,6 +2726,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
struct cifsFileInfo *smbfile = file->private_data;
+ struct inode *inode = file_inode(file);
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
rc = file_write_and_wait_range(file, start, end);
@@ -2725,12 +2743,23 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
tcon = tlink_tcon(smbfile->tlink);
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
server = tcon->ses->server;
- if (server->ops->flush)
- rc = server->ops->flush(xid, tcon, &smbfile->fid);
- else
+ if (server->ops->flush == NULL) {
rc = -ENOSYS;
+ goto fsync_exit;
+ }
+
+ if ((OPEN_FMODE(smbfile->f_flags) & FMODE_WRITE) == 0) {
+ smbfile = find_writable_file(CIFS_I(inode), FIND_WR_ANY);
+ if (smbfile) {
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
+ cifsFileInfo_put(smbfile);
+ } else
+ cifs_dbg(FYI, "ignore fsync for file not open for write\n");
+ } else
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
}
+fsync_exit:
free_xid(xid);
return rc;
}
@@ -3184,7 +3213,7 @@ restart_loop:
mutex_unlock(&ctx->aio_mutex);
if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
else
complete(&ctx->done);
}
@@ -3711,6 +3740,11 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
break;
}
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
+ cifs_sb->ctx);
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&rsize, credits);
if (rc)
@@ -3917,7 +3951,7 @@ again:
mutex_unlock(&ctx->aio_mutex);
if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
else
complete(&ctx->done);
}
@@ -4175,12 +4209,20 @@ static vm_fault_t
cifs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct file *file = vmf->vma->vm_file;
- struct inode *inode = file_inode(file);
- cifs_fscache_wait_on_page_write(inode, page);
+ /* Wait for the page to be written to the cache before we allow it to
+ * be modified. We then assume the entire page will need writing back.
+ */
+#ifdef CONFIG_CIFS_FSCACHE
+ if (PageFsCache(page) &&
+ wait_on_page_fscache_killable(page) < 0)
+ return VM_FAULT_RETRY;
+#endif
- lock_page(page);
+ wait_on_page_writeback(page);
+
+ if (lock_page_killable(page) < 0)
+ return VM_FAULT_RETRY;
return VM_FAULT_LOCKED;
}
@@ -4238,8 +4280,6 @@ cifs_readv_complete(struct work_struct *work)
for (i = 0; i < rdata->nr_pages; i++) {
struct page *page = rdata->pages[i];
- lru_cache_add(page);
-
if (rdata->result == 0 ||
(rdata->result == -EAGAIN && got_bytes)) {
flush_dcache_page(page);
@@ -4247,13 +4287,11 @@ cifs_readv_complete(struct work_struct *work)
} else
SetPageError(page);
- unlock_page(page);
-
if (rdata->result == 0 ||
(rdata->result == -EAGAIN && got_bytes))
cifs_readpage_to_fscache(rdata->mapping->host, page);
- else
- cifs_fscache_uncache_page(rdata->mapping->host, page);
+
+ unlock_page(page);
got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes);
@@ -4311,7 +4349,6 @@ readpages_fill_pages(struct TCP_Server_Info *server,
* fill them until the writes are flushed.
*/
zero_user(page, 0, PAGE_SIZE);
- lru_cache_add(page);
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
@@ -4321,7 +4358,6 @@ readpages_fill_pages(struct TCP_Server_Info *server,
continue;
} else {
/* no need to hold page hostage */
- lru_cache_add(page);
unlock_page(page);
put_page(page);
rdata->pages[i] = NULL;
@@ -4364,92 +4400,20 @@ cifs_readpages_copy_into_pages(struct TCP_Server_Info *server,
return readpages_fill_pages(server, rdata, iter, iter->count);
}
-static int
-readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
- unsigned int rsize, struct list_head *tmplist,
- unsigned int *nr_pages, loff_t *offset, unsigned int *bytes)
+static void cifs_readahead(struct readahead_control *ractl)
{
- struct page *page, *tpage;
- unsigned int expected_index;
int rc;
- gfp_t gfp = readahead_gfp_mask(mapping);
-
- INIT_LIST_HEAD(tmplist);
-
- page = lru_to_page(page_list);
-
- /*
- * Lock the page and put it in the cache. Since no one else
- * should have access to this page, we're safe to simply set
- * PG_locked without checking it first.
- */
- __SetPageLocked(page);
- rc = add_to_page_cache_locked(page, mapping,
- page->index, gfp);
-
- /* give up if we can't stick it in the cache */
- if (rc) {
- __ClearPageLocked(page);
- return rc;
- }
-
- /* move first page to the tmplist */
- *offset = (loff_t)page->index << PAGE_SHIFT;
- *bytes = PAGE_SIZE;
- *nr_pages = 1;
- list_move_tail(&page->lru, tmplist);
-
- /* now try and add more pages onto the request */
- expected_index = page->index + 1;
- list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
- /* discontinuity ? */
- if (page->index != expected_index)
- break;
-
- /* would this page push the read over the rsize? */
- if (*bytes + PAGE_SIZE > rsize)
- break;
-
- __SetPageLocked(page);
- rc = add_to_page_cache_locked(page, mapping, page->index, gfp);
- if (rc) {
- __ClearPageLocked(page);
- break;
- }
- list_move_tail(&page->lru, tmplist);
- (*bytes) += PAGE_SIZE;
- expected_index++;
- (*nr_pages)++;
- }
- return rc;
-}
-
-static int cifs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *page_list, unsigned num_pages)
-{
- int rc;
- int err = 0;
- struct list_head tmplist;
- struct cifsFileInfo *open_file = file->private_data;
- struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
+ struct cifsFileInfo *open_file = ractl->file->private_data;
+ struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file);
struct TCP_Server_Info *server;
pid_t pid;
- unsigned int xid;
+ unsigned int xid, nr_pages, last_batch_size = 0, cache_nr_pages = 0;
+ pgoff_t next_cached = ULONG_MAX;
+ bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) &&
+ cifs_inode_cookie(ractl->mapping->host)->cache_priv;
+ bool check_cache = caching;
xid = get_xid();
- /*
- * Reads as many pages as possible from fscache. Returns -ENOBUFS
- * immediately if the cookie is negative
- *
- * After this point, every page in the list might have PG_fscache set,
- * so we will need to clean that up off of every page we don't use.
- */
- rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
- &num_pages);
- if (rc == 0) {
- free_xid(xid);
- return rc;
- }
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -4460,39 +4424,78 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
- __func__, file, mapping, num_pages);
+ __func__, ractl->file, ractl->mapping, readahead_count(ractl));
/*
- * Start with the page at end of list and move it to private
- * list. Do the same with any following pages until we hit
- * the rsize limit, hit an index discontinuity, or run out of
- * pages. Issue the async read and then start the loop again
- * until the list is empty.
- *
- * Note that list order is important. The page_list is in
- * the order of declining indexes. When we put the pages in
- * the rdata->pages, then we want them in increasing order.
+ * Chop the readahead request up into rsize-sized read requests.
*/
- while (!list_empty(page_list) && !err) {
- unsigned int i, nr_pages, bytes, rsize;
- loff_t offset;
- struct page *page, *tpage;
+ while ((nr_pages = readahead_count(ractl) - last_batch_size)) {
+ unsigned int i, got, rsize;
+ struct page *page;
struct cifs_readdata *rdata;
struct cifs_credits credits_on_stack;
struct cifs_credits *credits = &credits_on_stack;
+ pgoff_t index = readahead_index(ractl) + last_batch_size;
+
+ /*
+ * Find out if we have anything cached in the range of
+ * interest, and if so, where the next chunk of cached data is.
+ */
+ if (caching) {
+ if (check_cache) {
+ rc = cifs_fscache_query_occupancy(
+ ractl->mapping->host, index, nr_pages,
+ &next_cached, &cache_nr_pages);
+ if (rc < 0)
+ caching = false;
+ check_cache = false;
+ }
+
+ if (index == next_cached) {
+ /*
+ * TODO: Send a whole batch of pages to be read
+ * by the cache.
+ */
+ page = readahead_page(ractl);
+ last_batch_size = 1 << thp_order(page);
+ if (cifs_readpage_from_fscache(ractl->mapping->host,
+ page) < 0) {
+ /*
+ * TODO: Deal with cache read failure
+ * here, but for the moment, delegate
+ * that to readpage.
+ */
+ caching = false;
+ }
+ unlock_page(page);
+ next_cached++;
+ cache_nr_pages--;
+ if (cache_nr_pages == 0)
+ check_cache = true;
+ continue;
+ }
+ }
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, true);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
+ if (rc) {
+ if (rc == -EAGAIN)
+ continue;
break;
+ }
}
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
+ cifs_sb->ctx);
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&rsize, credits);
if (rc)
break;
+ nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl));
+ nr_pages = min_t(size_t, nr_pages, next_cached - index);
/*
* Give up immediately if rsize is too small to read an entire
@@ -4500,16 +4503,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
* reach this point however since we set ra_pages to 0 when the
* rsize is smaller than a cache page.
*/
- if (unlikely(rsize < PAGE_SIZE)) {
- add_credits_and_wake_if(server, credits, 0);
- free_xid(xid);
- return 0;
- }
-
- nr_pages = 0;
- err = readpages_get_pages(mapping, page_list, rsize, &tmplist,
- &nr_pages, &offset, &bytes);
- if (!nr_pages) {
+ if (unlikely(!nr_pages)) {
add_credits_and_wake_if(server, credits, 0);
break;
}
@@ -4517,36 +4511,31 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
if (!rdata) {
/* best to give up if we're out of mem */
- list_for_each_entry_safe(page, tpage, &tmplist, lru) {
- list_del(&page->lru);
- lru_cache_add(page);
- unlock_page(page);
- put_page(page);
- }
- rc = -ENOMEM;
add_credits_and_wake_if(server, credits, 0);
break;
}
- rdata->cfile = cifsFileInfo_get(open_file);
- rdata->server = server;
- rdata->mapping = mapping;
- rdata->offset = offset;
- rdata->bytes = bytes;
- rdata->pid = pid;
- rdata->pagesz = PAGE_SIZE;
- rdata->tailsz = PAGE_SIZE;
+ got = __readahead_batch(ractl, rdata->pages, nr_pages);
+ if (got != nr_pages) {
+ pr_warn("__readahead_batch() returned %u/%u\n",
+ got, nr_pages);
+ nr_pages = got;
+ }
+
+ rdata->nr_pages = nr_pages;
+ rdata->bytes = readahead_batch_length(ractl);
+ rdata->cfile = cifsFileInfo_get(open_file);
+ rdata->server = server;
+ rdata->mapping = ractl->mapping;
+ rdata->offset = readahead_pos(ractl);
+ rdata->pid = pid;
+ rdata->pagesz = PAGE_SIZE;
+ rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->copy_into_pages = cifs_readpages_copy_into_pages;
- rdata->credits = credits_on_stack;
-
- list_for_each_entry_safe(page, tpage, &tmplist, lru) {
- list_del(&page->lru);
- rdata->pages[rdata->nr_pages++] = page;
- }
+ rdata->credits = credits_on_stack;
rc = adjust_credits(server, &rdata->credits, rdata->bytes);
-
if (!rc) {
if (rdata->cfile->invalidHandle)
rc = -EAGAIN;
@@ -4558,7 +4547,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
add_credits_and_wake_if(server, &rdata->credits, 0);
for (i = 0; i < rdata->nr_pages; i++) {
page = rdata->pages[i];
- lru_cache_add(page);
unlock_page(page);
put_page(page);
}
@@ -4568,15 +4556,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
}
kref_put(&rdata->refcount, cifs_readdata_release);
+ last_batch_size = nr_pages;
}
- /* Any pages that have been shown to fscache but didn't get added to
- * the pagecache must be uncached before they get returned to the
- * allocator.
- */
- cifs_fscache_readpages_cancel(mapping->host, page_list);
free_xid(xid);
- return rc;
}
/*
@@ -4778,24 +4761,26 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
{
if (PagePrivate(page))
return 0;
-
- return cifs_fscache_release_page(page, gfp);
+ if (PageFsCache(page)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ wait_on_page_fscache(page);
+ }
+ fscache_note_page_release(cifs_inode_cookie(page->mapping->host));
+ return true;
}
-static void cifs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+/*
+ * invalidate_folio handler: waits on the folio with folio_wait_fscache() and does
+ * nothing else. @offset and @length are not used.
+ */
+static void cifs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
-
- if (offset == 0 && length == PAGE_SIZE)
- cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
+ folio_wait_fscache(folio);
}
-static int cifs_launder_page(struct page *page)
+static int cifs_launder_folio(struct folio *folio)
{
int rc = 0;
- loff_t range_start = page_offset(page);
- loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
+ loff_t range_start = folio_pos(folio);
+ loff_t range_end = range_start + folio_size(folio);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
@@ -4803,12 +4788,12 @@ static int cifs_launder_page(struct page *page)
.range_end = range_end,
};
- cifs_dbg(FYI, "Launder page: %p\n", page);
+ cifs_dbg(FYI, "Launder page: %lu\n", folio->index);
- if (clear_page_dirty_for_io(page))
- rc = cifs_writepage_locked(page, &wbc);
+ if (folio_clear_dirty_for_io(folio))
+ rc = cifs_writepage_locked(&folio->page, &wbc);
- cifs_fscache_invalidate_page(page, page->mapping->host);
+ folio_wait_fscache(folio);
return rc;
}
@@ -4898,7 +4883,7 @@ oplock_break_done:
* In the non-cached mode (mount with cache=none), we shunt off direct read and write requests
* so this method should never be called.
*
- * Direct IO is not yet supported in the cached mode.
+ * Direct IO is not yet supported in the cached mode.
*/
static ssize_t
cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
@@ -4965,18 +4950,32 @@ static void cifs_swap_deactivate(struct file *file)
/* do we need to unpin (or unlock) the file */
}
+/*
+ * Mark a page as having been made dirty and thus needing writeback. We also
+ * need to pin the cache object to write back to.
+ *
+ * With CONFIG_CIFS_FSCACHE this forwards to fscache_dirty_folio(), passing the cookie
+ * obtained from cifs_inode_cookie() for the mapping's host inode, and returns its result.
+ * Without it, cifs_dirty_folio is simply an alias for filemap_dirty_folio.
+ */
+#ifdef CONFIG_CIFS_FSCACHE
+static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ return fscache_dirty_folio(mapping, folio,
+ cifs_inode_cookie(mapping->host));
+}
+#else
+#define cifs_dirty_folio filemap_dirty_folio
+#endif
+
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
- .readpages = cifs_readpages,
+ .readahead = cifs_readahead,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = cifs_dirty_folio,
.releasepage = cifs_release_page,
.direct_IO = cifs_direct_io,
- .invalidatepage = cifs_invalidate_page,
- .launder_page = cifs_launder_page,
+ .invalidate_folio = cifs_invalidate_folio,
+ .launder_folio = cifs_launder_folio,
/*
* TODO: investigate and if useful we could add an cifs_migratePage
* helper (under an CONFIG_MIGRATION) in the future, and also
@@ -4997,8 +4996,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = cifs_dirty_folio,
.releasepage = cifs_release_page,
- .invalidatepage = cifs_invalidate_page,
- .launder_page = cifs_launder_page,
+ .invalidate_folio = cifs_invalidate_folio,
+ .launder_folio = cifs_launder_folio,
};
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 3109def8e199..a92e9eec521f 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -37,6 +37,8 @@
#include "rfc1002pdu.h"
#include "fs_context.h"
+static DEFINE_MUTEX(cifs_mount_mutex);
+
static const match_table_t cifs_smb_version_tokens = {
{ Smb_1, SMB1_VERSION_STRING },
{ Smb_20, SMB20_VERSION_STRING},
@@ -116,6 +118,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("nosharesock", Opt_nosharesock),
fsparam_flag_no("persistenthandles", Opt_persistent),
fsparam_flag_no("resilienthandles", Opt_resilient),
+ fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay),
fsparam_flag("domainauto", Opt_domainauto),
fsparam_flag("rdma", Opt_rdma),
fsparam_flag("modesid", Opt_modesid),
@@ -146,7 +149,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("echo_interval", Opt_echo_interval),
fsparam_u32("max_credits", Opt_max_credits),
fsparam_u32("handletimeout", Opt_handletimeout),
- fsparam_u32("snapshot", Opt_snapshot),
+ fsparam_u64("snapshot", Opt_snapshot),
fsparam_u32("max_channels", Opt_max_channels),
/* Mount options which take string value */
@@ -307,7 +310,9 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->nodename = NULL;
new_ctx->username = NULL;
new_ctx->password = NULL;
+ new_ctx->server_hostname = NULL;
new_ctx->domainname = NULL;
+ new_ctx->workstation_name = NULL;
new_ctx->UNC = NULL;
new_ctx->source = NULL;
new_ctx->iocharset = NULL;
@@ -318,9 +323,11 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(mount_options);
DUP_CTX_STR(username);
DUP_CTX_STR(password);
+ DUP_CTX_STR(server_hostname);
DUP_CTX_STR(UNC);
DUP_CTX_STR(source);
DUP_CTX_STR(domainname);
+ DUP_CTX_STR(workstation_name);
DUP_CTX_STR(nodename);
DUP_CTX_STR(iocharset);
@@ -430,6 +437,42 @@ out:
}
/*
+ * Remove duplicate path delimiters. Windows is supposed to do that
+ * but there are some bugs that prevent rename from working if there are
+ * multiple delimiters.
+ *
+ * @path is rewritten in place: leading, duplicated and trailing
+ * delimiters are dropped before the result is duplicated.
+ *
+ * Returns a sanitized duplicate of @path (NULL if the allocation fails).
+ * The caller is responsible for cleaning up the original.
+ */
+#define IS_DELIM(c) ((c) == '/' || (c) == '\\')
+static char *sanitize_path(char *path)
+{
+ char *cursor1 = path, *cursor2 = path;
+
+ /* skip all prepended delimiters */
+ while (IS_DELIM(*cursor1))
+ cursor1++;
+
+ /* copy the first letter */
+ *cursor2 = *cursor1;
+
+ /* copy the remainder... */
+ while (*(cursor1++)) {
+ /* ... skipping all duplicated delimiters */
+ if (IS_DELIM(*cursor1) && IS_DELIM(*cursor2))
+ continue;
+ *(++cursor2) = *cursor1;
+ }
+
+ /*
+ * If the last character is a delimiter, skip it. cursor2 is still at
+ * @path when the input held nothing but delimiters (or was empty), so
+ * check that first: there is no byte of ours in front of it to inspect
+ * or overwrite.
+ */
+ if (cursor2 > path && IS_DELIM(*(cursor2 - 1)))
+ cursor2--;
+
+ *(cursor2) = '\0';
+ return kstrdup(path, GFP_KERNEL);
+}
+
+/*
* Parse a devname into substrings and populate the ctx->UNC and ctx->prepath
* fields with the result. Returns 0 on success and an error otherwise
* (e.g. ENOMEM or EINVAL)
@@ -456,6 +499,12 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
if (!pos)
return -EINVAL;
+ /* record the server hostname */
+ kfree(ctx->server_hostname);
+ ctx->server_hostname = kstrndup(devname + 2, pos - devname - 2, GFP_KERNEL);
+ if (!ctx->server_hostname)
+ return -ENOMEM;
+
/* skip past delimiter */
++pos;
@@ -482,7 +531,7 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
if (!*pos)
return 0;
- ctx->prepath = kstrdup(pos, GFP_KERNEL);
+ ctx->prepath = sanitize_path(pos);
if (!ctx->prepath)
return -ENOMEM;
@@ -660,10 +709,14 @@ static int smb3_get_tree_common(struct fs_context *fc)
static int smb3_get_tree(struct fs_context *fc)
{
int err = smb3_fs_context_validate(fc);
+ int ret;
if (err)
return err;
- return smb3_get_tree_common(fc);
+ mutex_lock(&cifs_mount_mutex);
+ ret = smb3_get_tree_common(fc);
+ mutex_unlock(&cifs_mount_mutex);
+ return ret;
}
static void smb3_fs_context_free(struct fs_context *fc)
@@ -713,6 +766,11 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
cifs_errorf(fc, "can not change domainname during remount\n");
return -EINVAL;
}
+ if (new_ctx->workstation_name &&
+ (!old_ctx->workstation_name || strcmp(new_ctx->workstation_name, old_ctx->workstation_name))) {
+ cifs_errorf(fc, "can not change workstation_name during remount\n");
+ return -EINVAL;
+ }
if (new_ctx->nodename &&
(!old_ctx->nodename || strcmp(new_ctx->nodename, old_ctx->nodename))) {
cifs_errorf(fc, "can not change nodename during remount\n");
@@ -746,7 +804,8 @@ static int smb3_reconfigure(struct fs_context *fc)
return rc;
/*
- * We can not change UNC/username/password/domainname/nodename/iocharset
+ * We can not change UNC/username/password/domainname/
+ * workstation_name/nodename/iocharset
* during reconnect so ignore what we have in the new context and
* just use what we already have in cifs_sb->ctx.
*/
@@ -755,6 +814,7 @@ static int smb3_reconfigure(struct fs_context *fc)
STEAL_STRING(cifs_sb, ctx, username);
STEAL_STRING(cifs_sb, ctx, password);
STEAL_STRING(cifs_sb, ctx, domainname);
+ STEAL_STRING(cifs_sb, ctx, workstation_name);
STEAL_STRING(cifs_sb, ctx, nodename);
STEAL_STRING(cifs_sb, ctx, iocharset);
@@ -1018,7 +1078,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->echo_interval = result.uint_32;
break;
case Opt_snapshot:
- ctx->snapshot_time = result.uint_32;
+ ctx->snapshot_time = result.uint_64;
break;
case Opt_max_credits:
if (result.uint_32 < 20 || result.uint_32 > 60000) {
@@ -1383,6 +1443,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
}
break;
+ case Opt_tcp_nodelay:
+ /* tcp nodelay should not usually be needed since we CORK/UNCORK the socket */
+ if (result.negated)
+ ctx->sockopt_tcp_nodelay = false;
+ else
+ ctx->sockopt_tcp_nodelay = true;
+ break;
case Opt_domainauto:
ctx->domainauto = true;
break;
@@ -1400,13 +1467,22 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
int smb3_init_fs_context(struct fs_context *fc)
{
+ int rc;
struct smb3_fs_context *ctx;
char *nodename = utsname()->nodename;
int i;
ctx = kzalloc(sizeof(struct smb3_fs_context), GFP_KERNEL);
- if (unlikely(!ctx))
- return -ENOMEM;
+ if (unlikely(!ctx)) {
+ rc = -ENOMEM;
+ goto err_exit;
+ }
+
+ ctx->workstation_name = kstrdup(nodename, GFP_KERNEL);
+ if (unlikely(!ctx->workstation_name)) {
+ rc = -ENOMEM;
+ goto err_exit;
+ }
/*
* does not have to be perfect mapping since field is
@@ -1479,6 +1555,14 @@ int smb3_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &smb3_fs_context_ops;
return 0;
+
+err_exit:
+ if (ctx) {
+ kfree(ctx->workstation_name);
+ kfree(ctx);
+ }
+
+ return rc;
}
void
@@ -1496,12 +1580,16 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->username = NULL;
kfree_sensitive(ctx->password);
ctx->password = NULL;
+ kfree(ctx->server_hostname);
+ ctx->server_hostname = NULL;
kfree(ctx->UNC);
ctx->UNC = NULL;
kfree(ctx->source);
ctx->source = NULL;
kfree(ctx->domainname);
ctx->domainname = NULL;
+ kfree(ctx->workstation_name);
+ ctx->workstation_name = NULL;
kfree(ctx->nodename);
ctx->nodename = NULL;
kfree(ctx->iocharset);
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index a42ba71d7a81..e54090d9ef36 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -98,6 +98,7 @@ enum cifs_param {
Opt_nosharesock,
Opt_persistent,
Opt_resilient,
+ Opt_tcp_nodelay,
Opt_domainauto,
Opt_rdma,
Opt_modesid,
@@ -166,8 +167,10 @@ struct smb3_fs_context {
char *password;
char *domainname;
char *source;
+ char *server_hostname;
char *UNC;
char *nodename;
+ char *workstation_name;
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 8eedd20c44ab..a638b29e9062 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -12,350 +12,242 @@
#include "cifs_fs_sb.h"
#include "cifsproto.h"
-/*
- * Key layout of CIFS server cache index object
- */
-struct cifs_server_key {
- struct {
- uint16_t family; /* address family */
- __be16 port; /* IP port */
- } hdr;
- union {
- struct in_addr ipv4_addr;
- struct in6_addr ipv6_addr;
- };
-} __packed;
-
-/*
- * Get a cookie for a server object keyed by {IPaddress,port,family} tuple
- */
-void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
+static void cifs_fscache_fill_volume_coherency(
+ struct cifs_tcon *tcon,
+ struct cifs_fscache_volume_coherency_data *cd)
{
- const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
- const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
- const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
- struct cifs_server_key key;
- uint16_t key_len = sizeof(key.hdr);
+ memset(cd, 0, sizeof(*cd));
+ cd->resource_id = cpu_to_le64(tcon->resource_id);
+ cd->vol_create_time = tcon->vol_create_time;
+ cd->vol_serial_number = cpu_to_le32(tcon->vol_serial_number);
+}
- memset(&key, 0, sizeof(key));
+int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
+{
+ struct cifs_fscache_volume_coherency_data cd;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ struct fscache_volume *vcookie;
+ const struct sockaddr *sa = (struct sockaddr *)&server->dstaddr;
+ size_t slen, i;
+ char *sharename;
+ char *key;
+ int ret = -ENOMEM;
- /*
- * Should not be a problem as sin_family/sin6_family overlays
- * sa_family field
- */
- key.hdr.family = sa->sa_family;
+ tcon->fscache = NULL;
switch (sa->sa_family) {
case AF_INET:
- key.hdr.port = addr->sin_port;
- key.ipv4_addr = addr->sin_addr;
- key_len += sizeof(key.ipv4_addr);
- break;
-
case AF_INET6:
- key.hdr.port = addr6->sin6_port;
- key.ipv6_addr = addr6->sin6_addr;
- key_len += sizeof(key.ipv6_addr);
break;
-
default:
cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
- server->fscache = NULL;
- return;
+ return -EINVAL;
}
- server->fscache =
- fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
- &cifs_fscache_server_index_def,
- &key, key_len,
- NULL, 0,
- server, 0, true);
- cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
- __func__, server, server->fscache);
-}
-
-void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
-{
- cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
- __func__, server, server->fscache);
- fscache_relinquish_cookie(server->fscache, NULL, false);
- server->fscache = NULL;
-}
-
-void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
-{
- struct TCP_Server_Info *server = tcon->ses->server;
- char *sharename;
- struct cifs_fscache_super_auxdata auxdata;
+ memset(&key, 0, sizeof(key));
sharename = extract_sharename(tcon->treeName);
if (IS_ERR(sharename)) {
cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
- tcon->fscache = NULL;
- return;
+ return -EINVAL;
+ }
+
+ slen = strlen(sharename);
+ for (i = 0; i < slen; i++)
+ if (sharename[i] == '/')
+ sharename[i] = ';';
+
+ key = kasprintf(GFP_KERNEL, "cifs,%pISpc,%s", sa, sharename);
+ if (!key)
+ goto out;
+
+ cifs_fscache_fill_volume_coherency(tcon, &cd);
+ vcookie = fscache_acquire_volume(key,
+ NULL, /* preferred_cache */
+ &cd, sizeof(cd));
+ cifs_dbg(FYI, "%s: (%s/0x%p)\n", __func__, key, vcookie);
+ if (IS_ERR(vcookie)) {
+ if (vcookie != ERR_PTR(-EBUSY)) {
+ ret = PTR_ERR(vcookie);
+ goto out_2;
+ }
+ pr_err("Cache volume key already in use (%s)\n", key);
+ vcookie = NULL;
}
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.resource_id = tcon->resource_id;
- auxdata.vol_create_time = tcon->vol_create_time;
- auxdata.vol_serial_number = tcon->vol_serial_number;
-
- tcon->fscache =
- fscache_acquire_cookie(server->fscache,
- &cifs_fscache_super_index_def,
- sharename, strlen(sharename),
- &auxdata, sizeof(auxdata),
- tcon, 0, true);
+ tcon->fscache = vcookie;
+ ret = 0;
+out_2:
+ kfree(key);
+out:
kfree(sharename);
- cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
- __func__, server->fscache, tcon->fscache);
+ return ret;
}
void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
{
- struct cifs_fscache_super_auxdata auxdata;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.resource_id = tcon->resource_id;
- auxdata.vol_create_time = tcon->vol_create_time;
- auxdata.vol_serial_number = tcon->vol_serial_number;
+ struct cifs_fscache_volume_coherency_data cd;
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache);
- fscache_relinquish_cookie(tcon->fscache, &auxdata, false);
- tcon->fscache = NULL;
-}
-static void cifs_fscache_acquire_inode_cookie(struct cifsInodeInfo *cifsi,
- struct cifs_tcon *tcon)
-{
- struct cifs_fscache_inode_auxdata auxdata;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.eof = cifsi->server_eof;
- auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec;
- auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec;
- auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec;
- auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec;
-
- cifsi->fscache =
- fscache_acquire_cookie(tcon->fscache,
- &cifs_fscache_inode_object_def,
- &cifsi->uniqueid, sizeof(cifsi->uniqueid),
- &auxdata, sizeof(auxdata),
- cifsi, cifsi->vfs_inode.i_size, true);
+ cifs_fscache_fill_volume_coherency(tcon, &cd);
+ fscache_relinquish_volume(tcon->fscache, &cd, false);
+ tcon->fscache = NULL;
}
-static void cifs_fscache_enable_inode_cookie(struct inode *inode)
+void cifs_fscache_get_inode_cookie(struct inode *inode)
{
+ struct cifs_fscache_inode_coherency_data cd;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
- if (cifsi->fscache)
- return;
-
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE))
- return;
-
- cifs_fscache_acquire_inode_cookie(cifsi, tcon);
-
- cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
- __func__, tcon->fscache, cifsi->fscache);
-}
-
-void cifs_fscache_release_inode_cookie(struct inode *inode)
-{
- struct cifs_fscache_inode_auxdata auxdata;
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd);
- if (cifsi->fscache) {
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.eof = cifsi->server_eof;
- auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec;
- auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec;
- auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec;
- auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec;
-
- cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
- /* fscache_relinquish_cookie does not seem to update auxdata */
- fscache_update_cookie(cifsi->fscache, &auxdata);
- fscache_relinquish_cookie(cifsi->fscache, &auxdata, false);
- cifsi->fscache = NULL;
- }
+ cifsi->netfs_ctx.cache =
+ fscache_acquire_cookie(tcon->fscache, 0,
+ &cifsi->uniqueid, sizeof(cifsi->uniqueid),
+ &cd, sizeof(cd),
+ i_size_read(&cifsi->vfs_inode));
}
-void cifs_fscache_update_inode_cookie(struct inode *inode)
+void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update)
{
- struct cifs_fscache_inode_auxdata auxdata;
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
-
- if (cifsi->fscache) {
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.eof = cifsi->server_eof;
- auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec;
- auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec;
- auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec;
- auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec;
+ if (update) {
+ struct cifs_fscache_inode_coherency_data cd;
+ loff_t i_size = i_size_read(inode);
- cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
- fscache_update_cookie(cifsi->fscache, &auxdata);
+ cifs_fscache_fill_coherency(inode, &cd);
+ fscache_unuse_cookie(cifs_inode_cookie(inode), &cd, &i_size);
+ } else {
+ fscache_unuse_cookie(cifs_inode_cookie(inode), NULL, NULL);
}
}
-void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
-{
- cifs_fscache_enable_inode_cookie(inode);
-}
-
-void cifs_fscache_reset_inode_cookie(struct inode *inode)
+void cifs_fscache_release_inode_cookie(struct inode *inode)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
- struct fscache_cookie *old = cifsi->fscache;
-
- if (cifsi->fscache) {
- /* retire the current fscache cache and get a new one */
- fscache_relinquish_cookie(cifsi->fscache, NULL, true);
-
- cifs_fscache_acquire_inode_cookie(cifsi, tcon);
- cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
- __func__, cifsi->fscache, old);
- }
-}
+ struct fscache_cookie *cookie = cifs_inode_cookie(inode);
-int cifs_fscache_release_page(struct page *page, gfp_t gfp)
-{
- if (PageFsCache(page)) {
- struct inode *inode = page->mapping->host;
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
-
- cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
- __func__, page, cifsi->fscache);
- if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
- return 0;
+ if (cookie) {
+ cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie);
+ fscache_relinquish_cookie(cookie, false);
+ cifsi->netfs_ctx.cache = NULL;
}
-
- return 1;
-}
-
-static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
- int error)
-{
- cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error);
- if (!error)
- SetPageUptodate(page);
- unlock_page(page);
}
/*
- * Retrieve a page from FS-Cache
+ * Fallback page reading interface.
*/
-int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+static int fscache_fallback_read_page(struct inode *inode, struct page *page)
{
+ struct netfs_cache_resources cres;
+ struct fscache_cookie *cookie = cifs_inode_cookie(inode);
+ struct iov_iter iter;
+ struct bio_vec bvec[1];
int ret;
- cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
- __func__, CIFS_I(inode)->fscache, page, inode);
- ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
- cifs_readpage_from_fscache_complete,
- NULL,
- GFP_KERNEL);
- switch (ret) {
-
- case 0: /* page found in fscache, read submitted */
- cifs_dbg(FYI, "%s: submitted\n", __func__);
+ memset(&cres, 0, sizeof(cres));
+ bvec[0].bv_page = page;
+ bvec[0].bv_offset = 0;
+ bvec[0].bv_len = PAGE_SIZE;
+ iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+
+ ret = fscache_begin_read_operation(&cres, cookie);
+ if (ret < 0)
return ret;
- case -ENOBUFS: /* page won't be cached */
- case -ENODATA: /* page not in cache */
- cifs_dbg(FYI, "%s: %d\n", __func__, ret);
- return 1;
- default:
- cifs_dbg(VFS, "unknown error ret = %d\n", ret);
- }
+ ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL,
+ NULL, NULL);
+ fscache_end_operation(&cres);
return ret;
}
/*
- * Retrieve a set of pages from FS-Cache
+ * Fallback page writing interface.
*/
-int __cifs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
+static int fscache_fallback_write_page(struct inode *inode, struct page *page,
+ bool no_space_allocated_yet)
+{
+ struct netfs_cache_resources cres;
+ struct fscache_cookie *cookie = cifs_inode_cookie(inode);
+ struct iov_iter iter;
+ struct bio_vec bvec[1];
+ loff_t start = page_offset(page);
+ size_t len = PAGE_SIZE;
int ret;
- cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n",
- __func__, CIFS_I(inode)->fscache, *nr_pages, inode);
- ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
- pages, nr_pages,
- cifs_readpage_from_fscache_complete,
- NULL,
- mapping_gfp_mask(mapping));
- switch (ret) {
- case 0: /* read submitted to the cache for all pages */
- cifs_dbg(FYI, "%s: submitted\n", __func__);
- return ret;
-
- case -ENOBUFS: /* some pages are not cached and can't be */
- case -ENODATA: /* some pages are not cached */
- cifs_dbg(FYI, "%s: no page\n", __func__);
- return 1;
+ memset(&cres, 0, sizeof(cres));
+ bvec[0].bv_page = page;
+ bvec[0].bv_offset = 0;
+ bvec[0].bv_len = PAGE_SIZE;
+ iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
- default:
- cifs_dbg(FYI, "unknown error ret = %d\n", ret);
- }
+ ret = fscache_begin_write_operation(&cres, cookie);
+ if (ret < 0)
+ return ret;
+ ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
+ no_space_allocated_yet);
+ if (ret == 0)
+ ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL);
+ fscache_end_operation(&cres);
return ret;
}
-void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
+/*
+ * Retrieve a page from FS-Cache
+ */
+int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
{
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
int ret;
- WARN_ON(!cifsi->fscache);
+ cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
+ __func__, cifs_inode_cookie(inode), page, inode);
- cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
- __func__, cifsi->fscache, page, inode);
- ret = fscache_write_page(cifsi->fscache, page,
- cifsi->vfs_inode.i_size, GFP_KERNEL);
- if (ret != 0)
- fscache_uncache_page(cifsi->fscache, page);
-}
+ ret = fscache_fallback_read_page(inode, page);
+ if (ret < 0)
+ return ret;
-void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages)
-{
- cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n",
- __func__, CIFS_I(inode)->fscache, inode);
- fscache_readpages_cancel(CIFS_I(inode)->fscache, pages);
+ /* Read completed synchronously */
+ SetPageUptodate(page);
+ return 0;
}
-void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
+void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
{
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
- struct fscache_cookie *cookie = cifsi->fscache;
+ cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
+ __func__, cifs_inode_cookie(inode), page, inode);
- cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
- fscache_wait_on_page_write(cookie, page);
- fscache_uncache_page(cookie, page);
+ fscache_fallback_write_page(inode, page, true);
}
-void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
-{
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
- struct fscache_cookie *cookie = cifsi->fscache;
+/*
+ * Query the cache occupancy.
+ *
+ * Asks the cache backend about the @nr_pages pages starting at page index
+ * @first. When the query returns 0, *@_data_first and *@_data_nr_pages are
+ * set from the data extent the backend reported, converted to page units;
+ * otherwise they are left untouched.
+ *
+ * Returns the error from fscache_begin_read_operation() if that fails,
+ * else the result of the backend's ->query_occupancy().
+ */
+int __cifs_fscache_query_occupancy(struct inode *inode,
+ pgoff_t first, unsigned int nr_pages,
+ pgoff_t *_data_first,
+ unsigned int *_data_nr_pages)
+{
+ struct netfs_cache_resources cres;
+ struct fscache_cookie *cookie = cifs_inode_cookie(inode);
+ loff_t start, data_start;
+ size_t len, data_len;
+ int ret;
- cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
- fscache_wait_on_page_write(cookie, page);
-}
+ ret = fscache_begin_read_operation(&cres, cookie);
+ if (ret < 0)
+ return ret;
-void __cifs_fscache_uncache_page(struct inode *inode, struct page *page)
-{
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
- struct fscache_cookie *cookie = cifsi->fscache;
+ /*
+ * Widen before multiplying so the byte offset cannot wrap on builds
+ * where pgoff_t is narrower than loff_t.
+ */
+ start = (loff_t)first * PAGE_SIZE;
+ len = nr_pages * PAGE_SIZE;
+ ret = cres.ops->query_occupancy(&cres, start, len, PAGE_SIZE,
+ &data_start, &data_len);
+ if (ret == 0) {
+ *_data_first = data_start / PAGE_SIZE;
+ /* Report the extent the backend found, not the size we asked about. */
+ *_data_nr_pages = data_len / PAGE_SIZE;
+ }
- cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
- fscache_uncache_page(cookie, page);
+ fscache_end_operation(&cres);
+ return ret;
}
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 9baa1d0f22bd..52355c0912ae 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -9,173 +9,154 @@
#ifndef _CIFS_FSCACHE_H
#define _CIFS_FSCACHE_H
+#include <linux/swap.h>
#include <linux/fscache.h>
#include "cifsglob.h"
-#ifdef CONFIG_CIFS_FSCACHE
-
/*
- * Auxiliary data attached to CIFS superblock within the cache
+ * Coherency data attached to CIFS volume within the cache
*/
-struct cifs_fscache_super_auxdata {
- u64 resource_id; /* unique server resource id */
+struct cifs_fscache_volume_coherency_data {
+ __le64 resource_id; /* unique server resource id */
__le64 vol_create_time;
- u32 vol_serial_number;
+ __le32 vol_serial_number;
} __packed;
/*
- * Auxiliary data attached to CIFS inode within the cache
+ * Coherency data attached to CIFS inode within the cache.
*/
-struct cifs_fscache_inode_auxdata {
- u64 last_write_time_sec;
- u64 last_change_time_sec;
- u32 last_write_time_nsec;
- u32 last_change_time_nsec;
- u64 eof;
+struct cifs_fscache_inode_coherency_data {
+ __le64 last_write_time_sec;
+ __le64 last_change_time_sec;
+ __le32 last_write_time_nsec;
+ __le32 last_change_time_nsec;
};
-/*
- * cache.c
- */
-extern struct fscache_netfs cifs_fscache_netfs;
-extern const struct fscache_cookie_def cifs_fscache_server_index_def;
-extern const struct fscache_cookie_def cifs_fscache_super_index_def;
-extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
-
-extern int cifs_fscache_register(void);
-extern void cifs_fscache_unregister(void);
+#ifdef CONFIG_CIFS_FSCACHE
/*
* fscache.c
*/
-extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
-extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
-extern void cifs_fscache_get_super_cookie(struct cifs_tcon *);
+extern int cifs_fscache_get_super_cookie(struct cifs_tcon *);
extern void cifs_fscache_release_super_cookie(struct cifs_tcon *);
+extern void cifs_fscache_get_inode_cookie(struct inode *inode);
extern void cifs_fscache_release_inode_cookie(struct inode *);
-extern void cifs_fscache_update_inode_cookie(struct inode *inode);
-extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
-extern void cifs_fscache_reset_inode_cookie(struct inode *);
-
-extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
-extern void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page);
-extern void __cifs_fscache_uncache_page(struct inode *inode, struct page *page);
-extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
-extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
-extern int __cifs_readpages_from_fscache(struct inode *,
- struct address_space *,
- struct list_head *,
- unsigned *);
-extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *);
-
-extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
-
-static inline void cifs_fscache_invalidate_page(struct page *page,
- struct inode *inode)
+extern void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update);
+
+static inline
+void cifs_fscache_fill_coherency(struct inode *inode,
+ struct cifs_fscache_inode_coherency_data *cd)
{
- if (PageFsCache(page))
- __cifs_fscache_invalidate_page(page, inode);
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+ memset(cd, 0, sizeof(*cd));
+ cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec);
+ cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec);
+ cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec);
+ cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec);
}
-static inline void cifs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page)
+
+static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode)
{
- if (PageFsCache(page))
- __cifs_fscache_wait_on_page_write(inode, page);
+ return netfs_i_cookie(inode);
}
-static inline void cifs_fscache_uncache_page(struct inode *inode,
- struct page *page)
+static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags)
{
- if (PageFsCache(page))
- __cifs_fscache_uncache_page(inode, page);
+ struct cifs_fscache_inode_coherency_data cd;
+
+ cifs_fscache_fill_coherency(inode, &cd);
+ fscache_invalidate(cifs_inode_cookie(inode), &cd,
+ i_size_read(inode), flags);
}
-static inline int cifs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- if (CIFS_I(inode)->fscache)
- return __cifs_readpage_from_fscache(inode, page);
+extern int __cifs_fscache_query_occupancy(struct inode *inode,
+ pgoff_t first, unsigned int nr_pages,
+ pgoff_t *_data_first,
+ unsigned int *_data_nr_pages);
- return -ENOBUFS;
+static inline int cifs_fscache_query_occupancy(struct inode *inode,
+ pgoff_t first, unsigned int nr_pages,
+ pgoff_t *_data_first,
+ unsigned int *_data_nr_pages)
+{
+ if (!cifs_inode_cookie(inode))
+ return -ENOBUFS;
+ return __cifs_fscache_query_occupancy(inode, first, nr_pages,
+ _data_first, _data_nr_pages);
}
-static inline int cifs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
+extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage);
+extern void __cifs_readpage_to_fscache(struct inode *pinode, struct page *ppage);
+
+
+static inline int cifs_readpage_from_fscache(struct inode *inode,
+ struct page *page)
{
- if (CIFS_I(inode)->fscache)
- return __cifs_readpages_from_fscache(inode, mapping, pages,
- nr_pages);
+ if (cifs_inode_cookie(inode))
+ return __cifs_readpage_from_fscache(inode, page);
return -ENOBUFS;
}
static inline void cifs_readpage_to_fscache(struct inode *inode,
struct page *page)
{
- if (PageFsCache(page))
+ if (cifs_inode_cookie(inode))
__cifs_readpage_to_fscache(inode, page);
}
-static inline void cifs_fscache_readpages_cancel(struct inode *inode,
- struct list_head *pages)
+static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
{
- if (CIFS_I(inode)->fscache)
- return __cifs_fscache_readpages_cancel(inode, pages);
+ if (PageFsCache(page)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ wait_on_page_fscache(page);
+ fscache_note_page_release(cifs_inode_cookie(page->mapping->host));
+ }
+ return true;
}
#else /* CONFIG_CIFS_FSCACHE */
-static inline int cifs_fscache_register(void) { return 0; }
-static inline void cifs_fscache_unregister(void) {}
-
-static inline void
-cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
-static inline void
-cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
-static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {}
-static inline void
-cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {}
-
-static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
-static inline void cifs_fscache_update_inode_cookie(struct inode *inode) {}
-static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
- struct file *filp) {}
-static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
-static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+static inline
+void cifs_fscache_fill_coherency(struct inode *inode,
+ struct cifs_fscache_inode_coherency_data *cd)
{
- return 1; /* May release page */
}
-static inline void cifs_fscache_invalidate_page(struct page *page,
- struct inode *inode) {}
-static inline void cifs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page) {}
-static inline void cifs_fscache_uncache_page(struct inode *inode,
- struct page *page) {}
+static inline int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) { return 0; }
+static inline void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {}
-static inline int
-cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+static inline void cifs_fscache_get_inode_cookie(struct inode *inode) {}
+static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) {}
+static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; }
+static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {}
+
+static inline int cifs_fscache_query_occupancy(struct inode *inode,
+ pgoff_t first, unsigned int nr_pages,
+ pgoff_t *_data_first,
+ unsigned int *_data_nr_pages)
{
+ *_data_first = ULONG_MAX;
+ *_data_nr_pages = 0;
return -ENOBUFS;
}
-static inline int cifs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
+static inline int
+cifs_readpage_from_fscache(struct inode *inode, struct page *page)
{
return -ENOBUFS;
}
-static inline void cifs_readpage_to_fscache(struct inode *inode,
- struct page *page) {}
+static inline
+void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {}
-static inline void cifs_fscache_readpages_cancel(struct inode *inode,
- struct list_head *pages)
+/*
+ * Stub for builds without CONFIG_CIFS_FSCACHE. It carries the same name and
+ * signature as the helper defined when caching is enabled, so callers see one
+ * interface in both configurations.
+ */
+static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
{
+ return true; /* May release page */
}
#endif /* CONFIG_CIFS_FSCACHE */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82848412ad85..2f9e7d2f81b6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -49,7 +49,7 @@ static void cifs_set_ops(struct inode *inode)
inode->i_fop = &cifs_file_ops;
}
- /* check if server can support readpages */
+ /* check if server can support readahead */
if (cifs_sb_master_tcon(cifs_sb)->ses->server->max_read <
PAGE_SIZE + MAX_CIFS_HDR_SIZE)
inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
@@ -83,6 +83,7 @@ static void cifs_set_ops(struct inode *inode)
static void
cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
{
+ struct cifs_fscache_inode_coherency_data cd;
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
cifs_dbg(FYI, "%s: revalidating inode %llu\n",
@@ -113,6 +114,9 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n",
__func__, cifs_i->uniqueid);
set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags);
+ /* Invalidate fscache cookie */
+ cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd);
+ fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0);
}
/*
@@ -952,6 +956,12 @@ cifs_get_inode_info(struct inode **inode,
rc = server->ops->query_path_info(xid, tcon, cifs_sb,
full_path, tmp_data,
&adjust_tz, &is_reparse_point);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (rc == -ENOENT && is_tcon_dfs(tcon))
+ rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon,
+ cifs_sb,
+ full_path);
+#endif
data = tmp_data;
}
@@ -1298,10 +1308,7 @@ retry_iget5_locked:
inode->i_flags |= S_NOATIME | S_NOCMTIME;
if (inode->i_state & I_NEW) {
inode->i_ino = hash;
-#ifdef CONFIG_CIFS_FSCACHE
- /* initialize per-inode cache cookie pointer */
- CIFS_I(inode)->fscache = NULL;
-#endif
+ cifs_fscache_get_inode_cookie(inode);
unlock_new_inode(inode);
}
}
@@ -1356,11 +1363,6 @@ iget_no_retry:
goto out;
}
-#ifdef CONFIG_CIFS_FSCACHE
- /* populate tcon->resource_id */
- tcon->resource_id = CIFS_I(inode)->uniqueid;
-#endif
-
if (rc && tcon->pipe) {
cifs_dbg(FYI, "ipc connection - fake read inode\n");
spin_lock(&inode->i_lock);
@@ -2272,7 +2274,6 @@ cifs_invalidate_mapping(struct inode *inode)
__func__, inode);
}
- cifs_fscache_reset_inode_cookie(inode);
return rc;
}
@@ -2777,8 +2778,10 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
goto out;
if ((attrs->ia_valid & ATTR_SIZE) &&
- attrs->ia_size != i_size_read(inode))
+ attrs->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attrs->ia_size);
+ fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
+ }
setattr_copy(&init_user_ns, inode, attrs);
mark_inode_dirty(inode);
@@ -2973,8 +2976,10 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
goto cifs_setattr_exit;
if ((attrs->ia_valid & ATTR_SIZE) &&
- attrs->ia_size != i_size_read(inode))
+ attrs->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attrs->ia_size);
+ fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
+ }
setattr_copy(&init_user_ns, inode, attrs);
mark_inode_dirty(inode);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 852e54ee82c2..bbdf3281559c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -85,6 +85,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
if (rc != 1)
return -EINVAL;
+ if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+ return -EINVAL;
+
rc = symlink_hash(link_len, link_str, md5_hash);
if (rc) {
cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index bb1185fff8cc..afaf59c22193 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -75,6 +75,7 @@ sesInfoAlloc(void)
INIT_LIST_HEAD(&ret_buf->tcon_list);
mutex_init(&ret_buf->session_mutex);
spin_lock_init(&ret_buf->iface_lock);
+ spin_lock_init(&ret_buf->chan_lock);
}
return ret_buf;
}
@@ -94,6 +95,7 @@ sesInfoFree(struct cifs_ses *buf_to_free)
kfree_sensitive(buf_to_free->password);
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
+ kfree(buf_to_free->workstation_name);
kfree_sensitive(buf_to_free->auth_key.response);
kfree(buf_to_free->iface_list);
kfree_sensitive(buf_to_free);
@@ -114,7 +116,7 @@ tconInfoAlloc(void)
}
atomic_inc(&tconInfoAllocCount);
- ret_buf->tidStatus = CifsNew;
+ ret_buf->status = TID_NEW;
++ret_buf->tc_count;
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
@@ -138,9 +140,6 @@ tconInfoFree(struct cifs_tcon *buf_to_free)
kfree(buf_to_free->nativeFileSystem);
kfree_sensitive(buf_to_free->password);
kfree(buf_to_free->crfid.fid);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- kfree(buf_to_free->dfs_path);
-#endif
kfree(buf_to_free);
}
@@ -152,7 +151,7 @@ cifs_buf_get(void)
* SMB2 header is bigger than CIFS one - no problems to clean some
* more bytes for CIFS.
*/
- size_t buf_size = sizeof(struct smb2_sync_hdr);
+ size_t buf_size = sizeof(struct smb2_hdr);
/*
* We could use negotiated size instead of max_msgsize -
@@ -1287,69 +1286,69 @@ out:
return rc;
}
-static void tcon_super_cb(struct super_block *sb, void *arg)
-{
- struct super_cb_data *sd = arg;
- struct cifs_tcon *tcon = sd->data;
- struct cifs_sb_info *cifs_sb;
-
- if (sd->sb)
- return;
-
- cifs_sb = CIFS_SB(sb);
- if (tcon->dfs_path && cifs_sb->origin_fullpath &&
- !strcasecmp(tcon->dfs_path, cifs_sb->origin_fullpath))
- sd->sb = sb;
-}
-
-static inline struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon)
-{
- return __cifs_get_super(tcon_super_cb, tcon);
-}
-
-static inline void cifs_put_tcon_super(struct super_block *sb)
-{
- __cifs_put_super(sb);
-}
-#else
-static inline struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon)
-{
- return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void cifs_put_tcon_super(struct super_block *sb)
+int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix)
{
-}
-#endif
-
-int update_super_prepath(struct cifs_tcon *tcon, char *prefix)
-{
- struct super_block *sb;
- struct cifs_sb_info *cifs_sb;
- int rc = 0;
-
- sb = cifs_get_tcon_super(tcon);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
-
- cifs_sb = CIFS_SB(sb);
-
kfree(cifs_sb->prepath);
if (prefix && *prefix) {
cifs_sb->prepath = kstrdup(prefix, GFP_ATOMIC);
- if (!cifs_sb->prepath) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!cifs_sb->prepath)
+ return -ENOMEM;
convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
} else
cifs_sb->prepath = NULL;
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ return 0;
+}
-out:
- cifs_put_tcon_super(sb);
+/** cifs_dfs_query_info_nonascii_quirk
+ * Handle weird Windows SMB server behaviour. It responds with
+ * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request
+ * for "\<server>\<dfsname>\<linkpath>" DFS reference,
+ * where <dfsname> contains non-ASCII unicode symbols.
+ *
+ * Check such DFS reference and emulate -ENOENT if it is actual.
+ */
+int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *linkpath)
+{
+ char *treename, *dfspath, sep;
+ int treenamelen, linkpathlen, rc;
+
+ treename = tcon->treeName;
+ /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL
+ * messages MUST be encoded with exactly one leading backslash, not two
+ * leading backslashes.
+ */
+ sep = CIFS_DIR_SEP(cifs_sb);
+ if (treename[0] == sep && treename[1] == sep)
+ treename++;
+ linkpathlen = strlen(linkpath);
+ treenamelen = strnlen(treename, MAX_TREE_SIZE + 1);
+ dfspath = kzalloc(treenamelen + linkpathlen + 1, GFP_KERNEL);
+ if (!dfspath)
+ return -ENOMEM;
+ if (treenamelen)
+ memcpy(dfspath, treename, treenamelen);
+ memcpy(dfspath + treenamelen, linkpath, linkpathlen);
+ rc = dfs_cache_find(xid, tcon->ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), dfspath, NULL, NULL);
+ if (rc == 0) {
+ cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n",
+ dfspath);
+ rc = -EREMOTE;
+ } else if (rc == -EEXIST) {
+ cifs_dbg(FYI, "DFS ref '%s' is not found, emulate -ENOENT\n",
+ dfspath);
+ rc = -ENOENT;
+ } else {
+ cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc);
+ }
+ kfree(dfspath);
return rc;
}
+#endif
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index fa9fbd6a819c..235aa1b395eb 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -896,10 +896,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr)
if (class == ERRSRV && code == ERRbaduid) {
cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n",
code);
- spin_lock(&GlobalMid_Lock);
- if (mid->server->tcpStatus != CifsExiting)
- mid->server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
+ cifs_signal_cifsd_for_reconnect(mid->server, false);
}
}
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 25a2b8ef88b9..55758b9ec877 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -40,7 +40,7 @@
#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000
#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000
/* #define reserved4 0x1000000 */
-#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */
+#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we only set for SMB2+ */
/* #define reserved3 0x4000000 */
/* #define reserved2 0x8000000 */
/* #define reserved1 0x10000000 */
@@ -87,6 +87,30 @@ typedef struct _NEGOTIATE_MESSAGE {
/* followed by WorkstationString */
} __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE;
+#define NTLMSSP_REVISION_W2K3 0x0F
+
+/* See MS-NLMP section 2.2.2.10 */
+struct ntlmssp_version {
+ __u8 ProductMajorVersion;
+ __u8 ProductMinorVersion;
+ __le16 ProductBuild; /* we send the cifs.ko module version here */
+ __u8 Reserved[3];
+ __u8 NTLMRevisionCurrent; /* currently 0x0F */
+} __packed;
+
+/* see MS-NLMP section 2.2.1.1 */
+struct negotiate_message {
+ __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
+ __le32 MessageType; /* NtLmNegotiate = 1 */
+ __le32 NegotiateFlags;
+ SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */
+ SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */
+ struct ntlmssp_version Version;
+ /* SECURITY_BUFFER */
+ char DomainString[];
+ /* followed by WorkstationString */
+} __packed;
+
typedef struct _CHALLENGE_MESSAGE {
__u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
__le32 MessageType; /* NtLmChallenge = 2 */
@@ -119,7 +143,15 @@ typedef struct _AUTHENTICATE_MESSAGE {
*/
int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses);
-void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses);
+int build_ntlmssp_negotiate_blob(unsigned char **pbuffer, u16 *buflen,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ const struct nls_table *nls_cp);
+int build_ntlmssp_smb3_negotiate_blob(unsigned char **pbuffer, u16 *buflen,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ const struct nls_table *nls_cp);
int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen,
struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
const struct nls_table *nls_cp);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 23e02db7923f..32f478c7a66d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -17,6 +17,8 @@
#include "nterr.h"
#include <linux/utsname.h>
#include <linux/slab.h>
+#include <linux/version.h>
+#include "cifsfs.h"
#include "cifs_spnego.h"
#include "smb2proto.h"
#include "fs_context.h"
@@ -54,25 +56,84 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
{
int i;
+ spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
- if (is_server_using_iface(ses->chans[i].server, iface))
+ if (is_server_using_iface(ses->chans[i].server, iface)) {
+ spin_unlock(&ses->chan_lock);
return true;
+ }
}
+ spin_unlock(&ses->chan_lock);
return false;
}
+/* channel helper functions. assumed that chan_lock is held by caller. */
+
+unsigned int
+cifs_ses_get_chan_index(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int i;
+
+ for (i = 0; i < ses->chan_count; i++) {
+ if (ses->chans[i].server == server)
+ return i;
+ }
+
+ /* If we didn't find the channel, it is likely a bug */
+ WARN_ON(1);
+ return 0;
+}
+
+void
+cifs_chan_set_need_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ set_bit(chan_index, &ses->chans_need_reconnect);
+ cifs_dbg(FYI, "Set reconnect bitmask for chan %u; now 0x%lx\n",
+ chan_index, ses->chans_need_reconnect);
+}
+
+void
+cifs_chan_clear_need_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ clear_bit(chan_index, &ses->chans_need_reconnect);
+ cifs_dbg(FYI, "Cleared reconnect bitmask for chan %u; now 0x%lx\n",
+ chan_index, ses->chans_need_reconnect);
+}
+
+bool
+cifs_chan_needs_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index);
+}
+
/* returns number of channels added */
int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
{
- int old_chan_count = ses->chan_count;
- int left = ses->chan_max - ses->chan_count;
+ int old_chan_count, new_chan_count;
+ int left;
int i = 0;
int rc = 0;
int tries = 0;
struct cifs_server_iface *ifaces = NULL;
size_t iface_count;
+ spin_lock(&ses->chan_lock);
+
+ new_chan_count = old_chan_count = ses->chan_count;
+ left = ses->chan_max - ses->chan_count;
+
if (left <= 0) {
+ spin_unlock(&ses->chan_lock);
cifs_dbg(FYI,
"ses already at max_channels (%zu), nothing to open\n",
ses->chan_max);
@@ -80,15 +141,18 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
}
if (ses->server->dialect < SMB30_PROT_ID) {
+ spin_unlock(&ses->chan_lock);
cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n");
return 0;
}
if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname);
ses->chan_max = 1;
+ spin_unlock(&ses->chan_lock);
+ cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname);
return 0;
}
+ spin_unlock(&ses->chan_lock);
/*
* Make a copy of the iface list at the time and use that
@@ -142,10 +206,11 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
cifs_dbg(FYI, "successfully opened new channel on iface#%d\n",
i);
left--;
+ new_chan_count++;
}
kfree(ifaces);
- return ses->chan_count - old_chan_count;
+ return new_chan_count - old_chan_count;
}
/*
@@ -157,10 +222,14 @@ cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server)
{
int i;
+ spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
- if (ses->chans[i].server == server)
+ if (ses->chans[i].server == server) {
+ spin_unlock(&ses->chan_lock);
return &ses->chans[i];
+ }
}
+ spin_unlock(&ses->chan_lock);
return NULL;
}
@@ -168,6 +237,7 @@ static int
cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
struct cifs_server_iface *iface)
{
+ struct TCP_Server_Info *chan_server;
struct cifs_chan *chan;
struct smb3_fs_context ctx = {NULL};
static const char unc_fmt[] = "\\%s\\foo";
@@ -204,6 +274,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
/* Auth */
ctx.domainauto = ses->domainAuto;
ctx.domainname = ses->domainName;
+ ctx.server_hostname = ses->server->hostname;
ctx.username = ses->user_name;
ctx.password = ses->password;
ctx.sectype = ses->sectype;
@@ -240,19 +311,26 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
SMB2_CLIENT_GUID_SIZE);
ctx.use_client_guid = true;
- mutex_lock(&ses->session_mutex);
+ chan_server = cifs_get_tcp_session(&ctx, ses->server);
- chan = ses->binding_chan = &ses->chans[ses->chan_count];
- chan->server = cifs_get_tcp_session(&ctx);
+ spin_lock(&ses->chan_lock);
+ chan = &ses->chans[ses->chan_count];
+ chan->server = chan_server;
if (IS_ERR(chan->server)) {
rc = PTR_ERR(chan->server);
chan->server = NULL;
+ spin_unlock(&ses->chan_lock);
goto out;
}
- spin_lock(&cifs_tcp_ses_lock);
- chan->server->is_channel = true;
- spin_unlock(&cifs_tcp_ses_lock);
+ ses->chan_count++;
+ atomic_set(&ses->chan_seq, 0);
+ /* Mark this channel as needing connect/setup */
+ cifs_chan_set_need_reconnect(ses, chan->server);
+
+ spin_unlock(&ses->chan_lock);
+
+ mutex_lock(&ses->session_mutex);
/*
* We need to allocate the server crypto now as we will need
* to sign packets before we generate the channel signing key
@@ -261,34 +339,29 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
rc = smb311_crypto_shash_allocate(chan->server);
if (rc) {
cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+ mutex_unlock(&ses->session_mutex);
goto out;
}
- ses->binding = true;
- rc = cifs_negotiate_protocol(xid, ses);
- if (rc)
- goto out;
+ rc = cifs_negotiate_protocol(xid, ses, chan->server);
+ if (!rc)
+ rc = cifs_setup_session(xid, ses, chan->server, cifs_sb->local_nls);
- rc = cifs_setup_session(xid, ses, cifs_sb->local_nls);
- if (rc)
- goto out;
-
- /* success, put it on the list
- * XXX: sharing ses between 2 tcp servers is not possible, the
- * way "internal" linked lists works in linux makes element
- * only able to belong to one list
- *
- * the binding session is already established so the rest of
- * the code should be able to look it up, no need to add the
- * ses to the new server.
- */
+ mutex_unlock(&ses->session_mutex);
- ses->chan_count++;
- atomic_set(&ses->chan_seq, 0);
out:
- ses->binding = false;
- ses->binding_chan = NULL;
- mutex_unlock(&ses->session_mutex);
+ if (rc && chan->server) {
+ spin_lock(&ses->chan_lock);
+ /* we rely on all bits beyond chan_count to be clear */
+ cifs_chan_clear_need_reconnect(ses, chan->server);
+ ses->chan_count--;
+ /*
+ * chan_count should never reach 0 as at least the primary
+ * channel is always allocated
+ */
+ WARN_ON(ses->chan_count < 1);
+ spin_unlock(&ses->chan_lock);
+ }
if (rc && chan->server)
cifs_put_tcp_session(chan->server, 0);
@@ -296,7 +369,9 @@ out:
return rc;
}
-static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
+static __u32 cifs_ssetup_hdr(struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ SESSION_SETUP_ANDX *pSMB)
{
__u32 capabilities = 0;
@@ -309,7 +384,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32,
CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
USHRT_MAX));
- pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
+ pSMB->req.MaxMpxCount = cpu_to_le16(server->maxReq);
pSMB->req.VcNumber = cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@ -320,7 +395,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
- if (ses->server->sign)
+ if (server->sign)
pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
if (ses->capabilities & CAP_UNICODE) {
@@ -554,8 +629,8 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
{
unsigned int tioffset; /* challenge message target info area */
unsigned int tilen; /* challenge message target info area length */
-
CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
+ __u32 server_flags;
if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len);
@@ -573,12 +648,37 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
return -EINVAL;
}
+ server_flags = le32_to_cpu(pblob->NegotiateFlags);
+ cifs_dbg(FYI, "%s: negotiate=0x%08x challenge=0x%08x\n", __func__,
+ ses->ntlmssp->client_flags, server_flags);
+
+ if ((ses->ntlmssp->client_flags & (NTLMSSP_NEGOTIATE_SEAL | NTLMSSP_NEGOTIATE_SIGN)) &&
+ (!(server_flags & NTLMSSP_NEGOTIATE_56) && !(server_flags & NTLMSSP_NEGOTIATE_128))) {
+ cifs_dbg(VFS, "%s: requested signing/encryption but server did not return either 56-bit or 128-bit session key size\n",
+ __func__);
+ return -EINVAL;
+ }
+ if (!(server_flags & NTLMSSP_NEGOTIATE_NTLM) && !(server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) {
+ cifs_dbg(VFS, "%s: server does not seem to support either NTLMv1 or NTLMv2\n", __func__);
+ return -EINVAL;
+ }
+ if (ses->server->sign && !(server_flags & NTLMSSP_NEGOTIATE_SIGN)) {
+ cifs_dbg(VFS, "%s: forced packet signing but server does not seem to support it\n",
+ __func__);
+ return -EOPNOTSUPP;
+ }
+ if ((ses->ntlmssp->client_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+ !(server_flags & NTLMSSP_NEGOTIATE_KEY_XCH))
+ pr_warn_once("%s: authentication has been weakened as server does not support key exchange\n",
+ __func__);
+
+ ses->ntlmssp->server_flags = server_flags;
+
memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
- /* BB we could decode pblob->NegotiateFlags; some may be useful */
/* In particular we can examine sign flags */
/* BB spec says that if AvId field of MsvAvTimestamp is populated then
we must set the MIC field of the AUTHENTICATE_MESSAGE */
- ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
+
tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
if (tioffset > blob_len || tioffset + tilen > blob_len) {
@@ -599,18 +699,89 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
return 0;
}
+static int size_of_ntlmssp_blob(struct cifs_ses *ses, int base_size)
+{
+ int sz = base_size + ses->auth_key.len
+ - CIFS_SESS_KEY_SIZE + CIFS_CPHTXT_SIZE + 2;
+
+ if (ses->domainName)
+ sz += sizeof(__le16) * strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+ else
+ sz += sizeof(__le16);
+
+ if (ses->user_name)
+ sz += sizeof(__le16) * strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN);
+ else
+ sz += sizeof(__le16);
+
+ if (ses->workstation_name)
+ sz += sizeof(__le16) * strnlen(ses->workstation_name,
+ CIFS_MAX_WORKSTATION_LEN);
+ else
+ sz += sizeof(__le16);
+
+ return sz;
+}
+
+static inline void cifs_security_buffer_from_str(SECURITY_BUFFER *pbuf,
+ char *str_value,
+ int str_length,
+ unsigned char *pstart,
+ unsigned char **pcur,
+ const struct nls_table *nls_cp)
+{
+ unsigned char *tmp = pstart;
+ int len;
+
+ if (!pbuf)
+ return;
+
+ if (!pcur)
+ pcur = &tmp;
+
+ if (!str_value) {
+ pbuf->BufferOffset = cpu_to_le32(*pcur - pstart);
+ pbuf->Length = 0;
+ pbuf->MaximumLength = 0;
+ *pcur += sizeof(__le16);
+ } else {
+ len = cifs_strtoUTF16((__le16 *)*pcur,
+ str_value,
+ str_length,
+ nls_cp);
+ len *= sizeof(__le16);
+ pbuf->BufferOffset = cpu_to_le32(*pcur - pstart);
+ pbuf->Length = cpu_to_le16(len);
+ pbuf->MaximumLength = cpu_to_le16(len);
+ *pcur += len;
+ }
+}
+
/* BB Move to ntlmssp.c eventually */
-/* We do not malloc the blob, it is passed in pbuffer, because
- it is fixed size, and small, making this approach cleaner */
-void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
- struct cifs_ses *ses)
+int build_ntlmssp_negotiate_blob(unsigned char **pbuffer,
+ u16 *buflen,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ const struct nls_table *nls_cp)
{
- struct TCP_Server_Info *server = cifs_ses_server(ses);
- NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
+ int rc = 0;
+ NEGOTIATE_MESSAGE *sec_blob;
__u32 flags;
+ unsigned char *tmp;
+ int len;
- memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
+ len = size_of_ntlmssp_blob(ses, sizeof(NEGOTIATE_MESSAGE));
+ *pbuffer = kmalloc(len, GFP_KERNEL);
+ if (!*pbuffer) {
+ rc = -ENOMEM;
+ cifs_dbg(VFS, "Error %d during NTLMSSP allocation\n", rc);
+ *buflen = 0;
+ goto setup_ntlm_neg_ret;
+ }
+ sec_blob = (NEGOTIATE_MESSAGE *)*pbuffer;
+
+ memset(*pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
sec_blob->MessageType = NtLmNegotiate;
@@ -618,51 +789,112 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC |
- NTLMSSP_NEGOTIATE_SEAL;
- if (server->sign)
- flags |= NTLMSSP_NEGOTIATE_SIGN;
+ NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NTLMSSP_NEGOTIATE_SEAL |
+ NTLMSSP_NEGOTIATE_SIGN;
if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
+ tmp = *pbuffer + sizeof(NEGOTIATE_MESSAGE);
+ ses->ntlmssp->client_flags = flags;
sec_blob->NegotiateFlags = cpu_to_le32(flags);
- sec_blob->WorkstationName.BufferOffset = 0;
- sec_blob->WorkstationName.Length = 0;
- sec_blob->WorkstationName.MaximumLength = 0;
+ /* these fields should be null in negotiate phase MS-NLMP 3.1.5.1.1 */
+ cifs_security_buffer_from_str(&sec_blob->DomainName,
+ NULL,
+ CIFS_MAX_DOMAINNAME_LEN,
+ *pbuffer, &tmp,
+ nls_cp);
- /* Domain name is sent on the Challenge not Negotiate NTLMSSP request */
- sec_blob->DomainName.BufferOffset = 0;
- sec_blob->DomainName.Length = 0;
- sec_blob->DomainName.MaximumLength = 0;
+ cifs_security_buffer_from_str(&sec_blob->WorkstationName,
+ NULL,
+ CIFS_MAX_WORKSTATION_LEN,
+ *pbuffer, &tmp,
+ nls_cp);
+
+ *buflen = tmp - *pbuffer;
+setup_ntlm_neg_ret:
+ return rc;
}
-static int size_of_ntlmssp_blob(struct cifs_ses *ses)
+/*
+ * Build ntlmssp blob with additional fields, such as version,
+ * supported by modern servers. For safety limit to SMB3 or later
+ * See notes in MS-NLMP Section 2.2.2.1 e.g.
+ */
+int build_ntlmssp_smb3_negotiate_blob(unsigned char **pbuffer,
+ u16 *buflen,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ const struct nls_table *nls_cp)
{
- int sz = sizeof(AUTHENTICATE_MESSAGE) + ses->auth_key.len
- - CIFS_SESS_KEY_SIZE + CIFS_CPHTXT_SIZE + 2;
+ int rc = 0;
+ struct negotiate_message *sec_blob;
+ __u32 flags;
+ unsigned char *tmp;
+ int len;
- if (ses->domainName)
- sz += 2 * strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
- else
- sz += 2;
+ len = size_of_ntlmssp_blob(ses, sizeof(struct negotiate_message));
+ *pbuffer = kmalloc(len, GFP_KERNEL);
+ if (!*pbuffer) {
+ rc = -ENOMEM;
+ cifs_dbg(VFS, "Error %d during NTLMSSP allocation\n", rc);
+ *buflen = 0;
+ goto setup_ntlm_smb3_neg_ret;
+ }
+ sec_blob = (struct negotiate_message *)*pbuffer;
- if (ses->user_name)
- sz += 2 * strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN);
- else
- sz += 2;
+ memset(*pbuffer, 0, sizeof(struct negotiate_message));
+ memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
+ sec_blob->MessageType = NtLmNegotiate;
- return sz;
+ /* BB is NTLMV2 session security format easier to use here? */
+ flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
+ NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
+ NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC |
+ NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NTLMSSP_NEGOTIATE_SEAL |
+ NTLMSSP_NEGOTIATE_SIGN | NTLMSSP_NEGOTIATE_VERSION;
+ if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
+ flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
+
+ sec_blob->Version.ProductMajorVersion = LINUX_VERSION_MAJOR;
+ sec_blob->Version.ProductMinorVersion = LINUX_VERSION_PATCHLEVEL;
+ sec_blob->Version.ProductBuild = cpu_to_le16(SMB3_PRODUCT_BUILD);
+ sec_blob->Version.NTLMRevisionCurrent = NTLMSSP_REVISION_W2K3;
+
+ tmp = *pbuffer + sizeof(struct negotiate_message);
+ ses->ntlmssp->client_flags = flags;
+ sec_blob->NegotiateFlags = cpu_to_le32(flags);
+
+ /* these fields should be null in negotiate phase MS-NLMP 3.1.5.1.1 */
+ cifs_security_buffer_from_str(&sec_blob->DomainName,
+ NULL,
+ CIFS_MAX_DOMAINNAME_LEN,
+ *pbuffer, &tmp,
+ nls_cp);
+
+ cifs_security_buffer_from_str(&sec_blob->WorkstationName,
+ NULL,
+ CIFS_MAX_WORKSTATION_LEN,
+ *pbuffer, &tmp,
+ nls_cp);
+
+ *buflen = tmp - *pbuffer;
+setup_ntlm_smb3_neg_ret:
+ return rc;
}
+
int build_ntlmssp_auth_blob(unsigned char **pbuffer,
u16 *buflen,
struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
const struct nls_table *nls_cp)
{
int rc;
AUTHENTICATE_MESSAGE *sec_blob;
__u32 flags;
unsigned char *tmp;
+ int len;
rc = setup_ntlmv2_rsp(ses, nls_cp);
if (rc) {
@@ -670,7 +902,9 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
*buflen = 0;
goto setup_ntlmv2_ret;
}
- *pbuffer = kmalloc(size_of_ntlmssp_blob(ses), GFP_KERNEL);
+
+ len = size_of_ntlmssp_blob(ses, sizeof(AUTHENTICATE_MESSAGE));
+ *pbuffer = kmalloc(len, GFP_KERNEL);
if (!*pbuffer) {
rc = -ENOMEM;
cifs_dbg(VFS, "Error %d during NTLMSSP allocation\n", rc);
@@ -682,15 +916,8 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
sec_blob->MessageType = NtLmAuthenticate;
- flags = NTLMSSP_NEGOTIATE_56 |
- NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
- NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
- NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC |
- NTLMSSP_NEGOTIATE_SEAL;
- if (ses->server->sign)
- flags |= NTLMSSP_NEGOTIATE_SIGN;
- if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
- flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
+ flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET |
+ NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE);
sec_blob->NegotiateFlags = cpu_to_le32(flags);
@@ -719,46 +946,27 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
sec_blob->NtChallengeResponse.MaximumLength = 0;
}
- if (ses->domainName == NULL) {
- sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
- sec_blob->DomainName.Length = 0;
- sec_blob->DomainName.MaximumLength = 0;
- tmp += 2;
- } else {
- int len;
- len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
- CIFS_MAX_DOMAINNAME_LEN, nls_cp);
- len *= 2; /* unicode is 2 bytes each */
- sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
- sec_blob->DomainName.Length = cpu_to_le16(len);
- sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
- tmp += len;
- }
-
- if (ses->user_name == NULL) {
- sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
- sec_blob->UserName.Length = 0;
- sec_blob->UserName.MaximumLength = 0;
- tmp += 2;
- } else {
- int len;
- len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
- CIFS_MAX_USERNAME_LEN, nls_cp);
- len *= 2; /* unicode is 2 bytes each */
- sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
- sec_blob->UserName.Length = cpu_to_le16(len);
- sec_blob->UserName.MaximumLength = cpu_to_le16(len);
- tmp += len;
- }
-
- sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
- sec_blob->WorkstationName.Length = 0;
- sec_blob->WorkstationName.MaximumLength = 0;
- tmp += 2;
-
- if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
- (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
- && !calc_seckey(ses)) {
+ cifs_security_buffer_from_str(&sec_blob->DomainName,
+ ses->domainName,
+ CIFS_MAX_DOMAINNAME_LEN,
+ *pbuffer, &tmp,
+ nls_cp);
+
+ cifs_security_buffer_from_str(&sec_blob->UserName,
+ ses->user_name,
+ CIFS_MAX_USERNAME_LEN,
+ *pbuffer, &tmp,
+ nls_cp);
+
+ cifs_security_buffer_from_str(&sec_blob->WorkstationName,
+ ses->workstation_name,
+ CIFS_MAX_WORKSTATION_LEN,
+ *pbuffer, &tmp,
+ nls_cp);
+
+ if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+ (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess) &&
+ !calc_seckey(ses)) {
memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer);
sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
@@ -816,6 +1024,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
struct sess_data {
unsigned int xid;
struct cifs_ses *ses;
+ struct TCP_Server_Info *server;
struct nls_table *nls_cp;
void (*func)(struct sess_data *);
int result;
@@ -882,31 +1091,27 @@ static int
sess_establish_session(struct sess_data *sess_data)
{
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
- mutex_lock(&ses->server->srv_mutex);
- if (!ses->server->session_estab) {
- if (ses->server->sign) {
- ses->server->session_key.response =
+ mutex_lock(&server->srv_mutex);
+ if (!server->session_estab) {
+ if (server->sign) {
+ server->session_key.response =
kmemdup(ses->auth_key.response,
ses->auth_key.len, GFP_KERNEL);
- if (!ses->server->session_key.response) {
- mutex_unlock(&ses->server->srv_mutex);
+ if (!server->session_key.response) {
+ mutex_unlock(&server->srv_mutex);
return -ENOMEM;
}
- ses->server->session_key.len =
+ server->session_key.len =
ses->auth_key.len;
}
- ses->server->sequence_number = 0x2;
- ses->server->session_estab = true;
+ server->sequence_number = 0x2;
+ server->session_estab = true;
}
- mutex_unlock(&ses->server->srv_mutex);
+ mutex_unlock(&server->srv_mutex);
cifs_dbg(FYI, "CIFS session established successfully\n");
- spin_lock(&GlobalMid_Lock);
- ses->status = CifsGood;
- ses->need_reconnect = false;
- spin_unlock(&GlobalMid_Lock);
-
return 0;
}
@@ -940,6 +1145,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
SESSION_SETUP_ANDX *pSMB;
char *bcc_ptr;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
__u32 capabilities;
__u16 bytes_remaining;
@@ -951,7 +1157,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
bcc_ptr = sess_data->iov[2].iov_base;
- capabilities = cifs_ssetup_hdr(ses, pSMB);
+ capabilities = cifs_ssetup_hdr(ses, server, pSMB);
pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
@@ -1049,6 +1255,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
SESSION_SETUP_ANDX *pSMB;
char *bcc_ptr;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
__u32 capabilities;
__u16 bytes_remaining;
struct key *spnego_key = NULL;
@@ -1063,9 +1270,9 @@ sess_auth_kerberos(struct sess_data *sess_data)
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
bcc_ptr = sess_data->iov[2].iov_base;
- capabilities = cifs_ssetup_hdr(ses, pSMB);
+ capabilities = cifs_ssetup_hdr(ses, server, pSMB);
- spnego_key = cifs_get_spnego_key(ses);
+ spnego_key = cifs_get_spnego_key(ses, server);
if (IS_ERR(spnego_key)) {
rc = PTR_ERR(spnego_key);
spnego_key = NULL;
@@ -1189,12 +1396,13 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
{
SESSION_SETUP_ANDX *pSMB;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
__u32 capabilities;
char *bcc_ptr;
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
- capabilities = cifs_ssetup_hdr(ses, pSMB);
+ capabilities = cifs_ssetup_hdr(ses, server, pSMB);
if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
return -ENOSYS;
@@ -1228,8 +1436,10 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
struct smb_hdr *smb_buf;
SESSION_SETUP_ANDX *pSMB;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
__u16 bytes_remaining;
char *bcc_ptr;
+ unsigned char *ntlmsspblob = NULL;
u16 blob_len;
cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n");
@@ -1253,14 +1463,19 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
/* Build security blob before we assemble the request */
- build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses);
- sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
- sess_data->iov[1].iov_base = pSMB->req.SecurityBlob;
- pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
+ rc = build_ntlmssp_negotiate_blob(&ntlmsspblob,
+ &blob_len, ses, server,
+ sess_data->nls_cp);
+ if (rc)
+ goto out_free_ntlmsspblob;
+
+ sess_data->iov[1].iov_len = blob_len;
+ sess_data->iov[1].iov_base = ntlmsspblob;
+ pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
if (rc)
- goto out;
+ goto out_free_ntlmsspblob;
rc = sess_sendreceive(sess_data);
@@ -1274,14 +1489,14 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
rc = 0;
if (rc)
- goto out;
+ goto out_free_ntlmsspblob;
cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
if (smb_buf->WordCount != 4) {
rc = -EIO;
cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
- goto out;
+ goto out_free_ntlmsspblob;
}
ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
@@ -1295,10 +1510,13 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
cifs_dbg(VFS, "bad security blob length %d\n",
blob_len);
rc = -EINVAL;
- goto out;
+ goto out_free_ntlmsspblob;
}
rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
+
+out_free_ntlmsspblob:
+ kfree(ntlmsspblob);
out:
sess_free_buffer(sess_data);
@@ -1324,6 +1542,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
struct smb_hdr *smb_buf;
SESSION_SETUP_ANDX *pSMB;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
__u16 bytes_remaining;
char *bcc_ptr;
unsigned char *ntlmsspblob = NULL;
@@ -1340,7 +1559,8 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
smb_buf = (struct smb_hdr *)pSMB;
rc = build_ntlmssp_auth_blob(&ntlmsspblob,
- &blob_len, ses, sess_data->nls_cp);
+ &blob_len, ses, server,
+ sess_data->nls_cp);
if (rc)
goto out_free_ntlmsspblob;
sess_data->iov[1].iov_len = blob_len;
@@ -1411,7 +1631,7 @@ out_free_ntlmsspblob:
out:
sess_free_buffer(sess_data);
- if (!rc)
+ if (!rc)
rc = sess_establish_session(sess_data);
/* Cleanup */
@@ -1424,11 +1644,13 @@ out:
sess_data->result = rc;
}
-static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
+static int select_sec(struct sess_data *sess_data)
{
int type;
+ struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
- type = cifs_select_sectype(ses->server, ses->sectype);
+ type = cifs_select_sectype(server, ses->sectype);
cifs_dbg(FYI, "sess setup type %d\n", type);
if (type == Unspecified) {
cifs_dbg(VFS, "Unable to select appropriate authentication method!\n");
@@ -1459,7 +1681,8 @@ static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
}
int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_cp)
+ struct TCP_Server_Info *server,
+ const struct nls_table *nls_cp)
{
int rc = 0;
struct sess_data *sess_data;
@@ -1473,15 +1696,16 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
if (!sess_data)
return -ENOMEM;
- rc = select_sec(ses, sess_data);
- if (rc)
- goto out;
-
sess_data->xid = xid;
sess_data->ses = ses;
+ sess_data->server = server;
sess_data->buf0_type = CIFS_NO_BUFFER;
sess_data->nls_cp = (struct nls_table *) nls_cp;
+ rc = select_sec(sess_data);
+ if (rc)
+ goto out;
+
while (sess_data->func)
sess_data->func(sess_data);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 3b83839fc2c2..c71c9a44bef4 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -7,6 +7,7 @@
#include <linux/pagemap.h>
#include <linux/vfs.h>
+#include <uapi/linux/magic.h>
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
@@ -163,7 +164,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
{
__u64 mid = 0;
__u16 last_mid, cur_mid;
- bool collision;
+ bool collision, reconnect = false;
spin_lock(&GlobalMid_Lock);
@@ -215,7 +216,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
* an eventual reconnect to clean out the pending_mid_q.
*/
if (num_mids > 32768)
- server->tcpStatus = CifsNeedReconnect;
+ reconnect = true;
if (!collision) {
mid = (__u64)cur_mid;
@@ -225,6 +226,11 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
cur_mid++;
}
spin_unlock(&GlobalMid_Lock);
+
+ if (reconnect) {
+ cifs_signal_cifsd_for_reconnect(server, false);
+ }
+
return mid;
}
@@ -414,14 +420,16 @@ cifs_need_neg(struct TCP_Server_Info *server)
}
static int
-cifs_negotiate(const unsigned int xid, struct cifs_ses *ses)
+cifs_negotiate(const unsigned int xid,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
{
int rc;
- rc = CIFSSMBNegotiate(xid, ses);
+ rc = CIFSSMBNegotiate(xid, ses, server);
if (rc == -EAGAIN) {
/* retry only once on 1st time connection */
- set_credits(ses->server, 1);
- rc = CIFSSMBNegotiate(xid, ses);
+ set_credits(server, 1);
+ rc = CIFSSMBNegotiate(xid, ses, server);
if (rc == -EAGAIN)
rc = -EHOSTDOWN;
}
@@ -878,7 +886,7 @@ cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
{
int rc = -EOPNOTSUPP;
- buf->f_type = CIFS_MAGIC_NUMBER;
+ buf->f_type = CIFS_SUPER_MAGIC;
/*
* We could add a second check for a QFS Unix capability bit
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index ca692b2283cd..82e916ad167c 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -13,8 +13,6 @@
#ifndef _SMB2_GLOB_H
#define _SMB2_GLOB_H
-#define SMB2_MAGIC_NUMBER 0xFE534D42
-
/*
*****************************************************************
* Constants go here
@@ -43,15 +41,4 @@
#define END_OF_CHAIN 4
#define RELATED_REQUEST 8
-#define SMB2_SIGNATURE_SIZE (16)
-#define SMB2_NTLMV2_SESSKEY_SIZE (16)
-#define SMB2_HMACSHA256_SIZE (32)
-#define SMB2_CMACAES_SIZE (16)
-#define SMB3_SIGNKEY_SIZE (16)
-#define SMB3_GCM128_CRYPTKEY_SIZE (16)
-#define SMB3_GCM256_CRYPTKEY_SIZE (32)
-
-/* Maximum buffer size value we can send with 1 credit */
-#define SMB2_MAX_BUFFER_SIZE 65536
-
#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 8297703492ee..fe5bfa245fa7 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -46,6 +46,10 @@ struct cop_vars {
struct smb2_file_link_info link_info;
};
+/*
+ * note: If cfile is passed, the reference to it is dropped here.
+ * So make sure that you do not reuse cfile after return from this func.
+ */
static int
smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
@@ -536,10 +540,11 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
create_options |= OPEN_REPARSE_POINT;
/* Failed on a symbolic link - query a reparse point info */
+ cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
create_options, ACL_NO_MODE,
- smb2_data, SMB2_OP_QUERY_INFO, NULL);
+ smb2_data, SMB2_OP_QUERY_INFO, cfile);
}
if (rc)
goto out;
@@ -587,10 +592,11 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
create_options |= OPEN_REPARSE_POINT;
/* Failed on a symbolic link - query a reparse point info */
+ cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
create_options, ACL_NO_MODE,
- smb2_data, SMB2_OP_POSIX_QUERY_INFO, NULL);
+ smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile);
}
if (rc)
goto out;
@@ -707,10 +713,12 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, bool set_alloc)
{
__le64 eof = cpu_to_le64(size);
+ struct cifsFileInfo *cfile;
+ cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
return smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
- &eof, SMB2_OP_SET_EOF, NULL);
+ &eof, SMB2_OP_SET_EOF, cfile);
}
int
@@ -719,6 +727,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
+ struct cifsFileInfo *cfile;
int rc;
if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) &&
@@ -729,10 +739,12 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
- rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path,
+ cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN,
- 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, NULL);
+ 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile);
cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 181514b8770d..194799ddd382 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -2439,14 +2439,16 @@ smb2_print_status(__le32 status)
int
map_smb2_to_linux_error(char *buf, bool log_err)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
unsigned int i;
int rc = -EIO;
__le32 smb2err = shdr->Status;
if (smb2err == 0) {
- trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId,
- le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId));
+ trace_smb3_cmd_done(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
+ le16_to_cpu(shdr->Command),
+ le64_to_cpu(shdr->MessageId));
return 0;
}
@@ -2470,8 +2472,10 @@ map_smb2_to_linux_error(char *buf, bool log_err)
cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n",
__le32_to_cpu(smb2err), rc);
- trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId,
- le16_to_cpu(shdr->Command),
- le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc);
+ trace_smb3_cmd_err(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
+ le16_to_cpu(shdr->Command),
+ le64_to_cpu(shdr->MessageId),
+ le32_to_cpu(smb2err), rc);
return rc;
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 29b5554f6263..3fe47a88f47d 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -8,7 +8,6 @@
*
*/
#include <linux/ctype.h>
-#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "smb2proto.h"
@@ -19,7 +18,7 @@
#include "nterr.h"
static int
-check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid)
+check_smb2_hdr(struct smb2_hdr *shdr, __u64 mid)
{
__u64 wire_mid = le64_to_cpu(shdr->MessageId);
@@ -81,9 +80,9 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
/* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
};
-#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_sync_hdr) + sizeof(struct smb2_negotiate_rsp))
+#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_hdr) + sizeof(struct smb2_negotiate_rsp))
-static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
+static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
__u32 non_ctxlen)
{
__u16 neg_count;
@@ -135,13 +134,13 @@ static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
int
smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
- struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
+ struct smb2_pdu *pdu = (struct smb2_pdu *)shdr;
__u64 mid;
__u32 clc_len; /* calculated length */
int command;
- int pdu_size = sizeof(struct smb2_sync_pdu);
- int hdr_size = sizeof(struct smb2_sync_hdr);
+ int pdu_size = sizeof(struct smb2_pdu);
+ int hdr_size = sizeof(struct smb2_hdr);
/*
* Add function to do table lookup of StructureSize by command
@@ -151,16 +150,18 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
struct smb2_transform_hdr *thdr =
(struct smb2_transform_hdr *)buf;
struct cifs_ses *ses = NULL;
+ struct cifs_ses *iter;
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) {
- if (ses->Suid == thdr->SessionId)
+ list_for_each_entry(iter, &srvr->smb_ses_list, smb_ses_list) {
+ if (iter->Suid == le64_to_cpu(thdr->SessionId)) {
+ ses = iter;
break;
+ }
}
spin_unlock(&cifs_tcp_ses_lock);
- if (list_entry_is_head(ses, &srvr->smb_ses_list,
- smb_ses_list)) {
+ if (!ses) {
cifs_dbg(VFS, "no decryption - session id not found\n");
return 1;
}
@@ -204,7 +205,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) {
if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 ||
- pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) {
+ pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) {
/* error packets have 9 byte structure size */
cifs_dbg(VFS, "Invalid response size %u for command %d\n",
le16_to_cpu(pdu->StructureSize2), command);
@@ -296,7 +297,7 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
* area and the offset to it (from the beginning of the smb are also returned.
*/
char *
-smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
+smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
{
*off = 0;
*len = 0;
@@ -304,7 +305,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
/* error responses do not have data area */
if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
(((struct smb2_err_rsp *)shdr)->StructureSize) ==
- SMB2_ERROR_STRUCTURE_SIZE2)
+ SMB2_ERROR_STRUCTURE_SIZE2_LE)
return NULL;
/*
@@ -401,8 +402,8 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
unsigned int
smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
{
- struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf;
- struct smb2_sync_hdr *shdr = &pdu->sync_hdr;
+ struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
+ struct smb2_hdr *shdr = &pdu->hdr;
int offset; /* the offset from the beginning of SMB to data area */
int data_length; /* the length of the variable length data area */
/* Structure Size has already been checked to make sure it is 64 */
@@ -479,11 +480,11 @@ smb2_get_lease_state(struct cifsInodeInfo *cinode)
__le32 lease = 0;
if (CIFS_CACHE_WRITE(cinode))
- lease |= SMB2_LEASE_WRITE_CACHING;
+ lease |= SMB2_LEASE_WRITE_CACHING_LE;
if (CIFS_CACHE_HANDLE(cinode))
- lease |= SMB2_LEASE_HANDLE_CACHING;
+ lease |= SMB2_LEASE_HANDLE_CACHING_LE;
if (CIFS_CACHE_READ(cinode))
- lease |= SMB2_LEASE_READ_CACHING;
+ lease |= SMB2_LEASE_READ_CACHING_LE;
return lease;
}
@@ -669,7 +670,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
cifs_dbg(FYI, "Checking for oplock break\n");
- if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
+ if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
return false;
if (rsp->StructureSize !=
@@ -816,23 +817,23 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
int
smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *sync_hdr = mid->resp_buf;
+ struct smb2_hdr *hdr = mid->resp_buf;
struct smb2_create_rsp *rsp = mid->resp_buf;
struct cifs_tcon *tcon;
int rc;
- if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || sync_hdr->Command != SMB2_CREATE ||
- sync_hdr->Status != STATUS_SUCCESS)
+ if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || hdr->Command != SMB2_CREATE ||
+ hdr->Status != STATUS_SUCCESS)
return 0;
- tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
- sync_hdr->TreeId);
+ tcon = smb2_find_smb_tcon(server, le64_to_cpu(hdr->SessionId),
+ le32_to_cpu(hdr->Id.SyncId.TreeId));
if (!tcon)
return -ENOENT;
rc = __smb2_handle_cancelled_cmd(tcon,
- le16_to_cpu(sync_hdr->Command),
- le64_to_cpu(sync_hdr->MessageId),
+ le16_to_cpu(hdr->Command),
+ le64_to_cpu(hdr->MessageId),
rsp->PersistentFileId,
rsp->VolatileFileId);
if (rc)
@@ -848,18 +849,19 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve
* SMB2 header.
*
* @ses: server session structure
+ * @server: pointer to server info
* @iov: array containing the SMB request we will send to the server
* @nvec: number of array entries for the iov
*/
int
-smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec)
+smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server,
+ struct kvec *iov, int nvec)
{
int i, rc;
struct sdesc *d;
- struct smb2_sync_hdr *hdr;
- struct TCP_Server_Info *server = cifs_ses_server(ses);
+ struct smb2_hdr *hdr;
- hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ hdr = (struct smb2_hdr *)iov[0].iov_base;
/* neg prot are always taken */
if (hdr->Command == SMB2_NEGOTIATE)
goto ok;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index bda606dc72b1..d6aaeff4a30a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -13,6 +13,7 @@
#include <linux/sort.h>
#include <crypto/aead.h>
#include <linux/fiemap.h>
+#include <uapi/linux/magic.h>
#include "cifsfs.h"
#include "cifsglob.h"
#include "smb2pdu.h"
@@ -24,6 +25,7 @@
#include "smb2glob.h"
#include "cifs_ioctl.h"
#include "smbdirect.h"
+#include "fscache.h"
#include "fs_context.h"
/* Change credits for different ops and return the total number of credits */
@@ -84,6 +86,9 @@ smb2_add_credits(struct TCP_Server_Info *server,
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
pr_warn_once("server overflowed SMB3 credits\n");
+ trace_smb3_overflow_credits(server->CurrentMid,
+ server->conn_id, server->hostname, *val,
+ add, server->in_flight);
}
server->in_flight--;
if (server->in_flight == 0 &&
@@ -121,9 +126,13 @@ smb2_add_credits(struct TCP_Server_Info *server,
optype, scredits, add);
}
+ spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus == CifsNeedReconnect
- || server->tcpStatus == CifsExiting)
+ || server->tcpStatus == CifsExiting) {
+ spin_unlock(&cifs_tcp_ses_lock);
return;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
switch (rc) {
case -1:
@@ -208,11 +217,15 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
return rc;
spin_lock(&server->req_lock);
} else {
+ spin_unlock(&server->req_lock);
+ spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&server->req_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
return -ENOENT;
}
+ spin_unlock(&cifs_tcp_ses_lock);
+ spin_lock(&server->req_lock);
scredits = server->credits;
/* can deadlock with reopen */
if (scredits <= 8) {
@@ -241,7 +254,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_wait_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits, -(credits->value), in_flight);
cifs_dbg(FYI, "%s: removed %u credits total=%d\n",
__func__, credits->value, scredits);
@@ -290,7 +303,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_adj_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
credits->value - new_val, in_flight);
cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n",
@@ -325,7 +338,7 @@ static struct mid_q_entry *
__smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
{
struct mid_q_entry *mid;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
__u64 wire_mid = le64_to_cpu(shdr->MessageId);
if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
@@ -367,11 +380,11 @@ static void
smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
- shdr->ProcessId);
+ shdr->Id.SyncId.ProcessId);
cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
server->ops->calc_smb_size(buf, server));
#endif
@@ -384,14 +397,16 @@ smb2_need_neg(struct TCP_Server_Info *server)
}
static int
-smb2_negotiate(const unsigned int xid, struct cifs_ses *ses)
+smb2_negotiate(const unsigned int xid,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
{
int rc;
spin_lock(&GlobalMid_Lock);
- cifs_ses_server(ses)->CurrentMid = 0;
+ server->CurrentMid = 0;
spin_unlock(&GlobalMid_Lock);
- rc = SMB2_negotiate(xid, ses);
+ rc = SMB2_negotiate(xid, ses, server);
/* BB we probably don't need to retry with modern servers */
if (rc == -EAGAIN)
rc = -EHOSTDOWN;
@@ -888,7 +903,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
oparms.fid->persistent_fid = o_rsp->PersistentFileId;
oparms.fid->volatile_fid = o_rsp->VolatileFileId;
#ifdef CONFIG_CIFS_DEBUG2
- oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
+ oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
tcon->crfid.tcon = tcon;
@@ -1180,17 +1195,12 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb)
{
int rc;
- __le16 *utf16_path;
struct kvec rsp_iov = {NULL, 0};
int buftype = CIFS_NO_BUFFER;
struct smb2_query_info_rsp *rsp;
struct smb2_file_full_ea_info *info = NULL;
- utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
-
- rc = smb2_query_info_compound(xid, tcon, utf16_path,
+ rc = smb2_query_info_compound(xid, tcon, path,
FILE_READ_EA,
FILE_FULL_EA_INFORMATION,
SMB2_O_INFO_FILE,
@@ -1223,7 +1233,6 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
le32_to_cpu(rsp->OutputBufferLength), ea_name);
qeas_exit:
- kfree(utf16_path);
free_rsp_buf(buftype, rsp_iov.iov_base);
return rc;
}
@@ -1283,7 +1292,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
* the new EA. If not we should not add it since we
* would not be able to even read the EAs back.
*/
- rc = smb2_query_info_compound(xid, tcon, utf16_path,
+ rc = smb2_query_info_compound(xid, tcon, path,
FILE_READ_EA,
FILE_FULL_EA_INFORMATION,
SMB2_O_INFO_FILE,
@@ -1631,6 +1640,7 @@ smb2_ioctl_query_info(const unsigned int xid,
unsigned int size[2];
void *data[2];
int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR;
+ void (*free_req1_func)(struct smb_rqst *r);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
if (vars == NULL)
@@ -1640,27 +1650,29 @@ smb2_ioctl_query_info(const unsigned int xid,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
- if (copy_from_user(&qi, arg, sizeof(struct smb_query_info)))
- goto e_fault;
-
+ if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) {
+ rc = -EFAULT;
+ goto free_vars;
+ }
if (qi.output_buffer_length > 1024) {
- kfree(vars);
- return -EINVAL;
+ rc = -EINVAL;
+ goto free_vars;
}
if (!ses || !server) {
- kfree(vars);
- return -EIO;
+ rc = -EIO;
+ goto free_vars;
}
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- buffer = memdup_user(arg + sizeof(struct smb_query_info),
- qi.output_buffer_length);
- if (IS_ERR(buffer)) {
- kfree(vars);
- return PTR_ERR(buffer);
+ if (qi.output_buffer_length) {
+ buffer = memdup_user(arg + sizeof(struct smb_query_info), qi.output_buffer_length);
+ if (IS_ERR(buffer)) {
+ rc = PTR_ERR(buffer);
+ goto free_vars;
+ }
}
/* Open */
@@ -1698,45 +1710,45 @@ smb2_ioctl_query_info(const unsigned int xid,
rc = SMB2_open_init(tcon, server,
&rqst[0], &oplock, &oparms, path);
if (rc)
- goto iqinf_exit;
+ goto free_output_buffer;
smb2_set_next_command(tcon, &rqst[0]);
/* Query */
if (qi.flags & PASSTHRU_FSCTL) {
/* Can eventually relax perm check since server enforces too */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN)) {
rc = -EPERM;
- else {
- rqst[1].rq_iov = &vars->io_iov[0];
- rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
-
- rc = SMB2_ioctl_init(tcon, server,
- &rqst[1],
- COMPOUND_FID, COMPOUND_FID,
- qi.info_type, true, buffer,
- qi.output_buffer_length,
- CIFSMaxBufSize -
- MAX_SMB2_CREATE_RESPONSE_SIZE -
- MAX_SMB2_CLOSE_RESPONSE_SIZE);
+ goto free_open_req;
}
+ rqst[1].rq_iov = &vars->io_iov[0];
+ rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID,
+ qi.info_type, true, buffer, qi.output_buffer_length,
+ CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE);
+ free_req1_func = SMB2_ioctl_free;
} else if (qi.flags == PASSTHRU_SET_INFO) {
/* Can eventually relax perm check since server enforces too */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN)) {
rc = -EPERM;
- else {
- rqst[1].rq_iov = &vars->si_iov[0];
- rqst[1].rq_nvec = 1;
-
- size[0] = 8;
- data[0] = buffer;
-
- rc = SMB2_set_info_init(tcon, server,
- &rqst[1],
- COMPOUND_FID, COMPOUND_FID,
- current->tgid,
- FILE_END_OF_FILE_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
+ goto free_open_req;
+ }
+ if (qi.output_buffer_length < 8) {
+ rc = -EINVAL;
+ goto free_open_req;
}
+ rqst[1].rq_iov = &vars->si_iov[0];
+ rqst[1].rq_nvec = 1;
+
+ /* MS-FSCC 2.4.13 FileEndOfFileInformation */
+ size[0] = 8;
+ data[0] = buffer;
+
+ rc = SMB2_set_info_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID,
+ current->tgid, FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ free_req1_func = SMB2_set_info_free;
} else if (qi.flags == PASSTHRU_QUERY_INFO) {
rqst[1].rq_iov = &vars->qi_iov[0];
rqst[1].rq_nvec = 1;
@@ -1747,6 +1759,7 @@ smb2_ioctl_query_info(const unsigned int xid,
qi.info_type, qi.additional_information,
qi.input_buffer_length,
qi.output_buffer_length, buffer);
+ free_req1_func = SMB2_query_info_free;
} else { /* unknown flags */
cifs_tcon_dbg(VFS, "Invalid passthru query flags: 0x%x\n",
qi.flags);
@@ -1754,7 +1767,7 @@ smb2_ioctl_query_info(const unsigned int xid,
}
if (rc)
- goto iqinf_exit;
+ goto free_open_req;
smb2_set_next_command(tcon, &rqst[1]);
smb2_set_related(&rqst[1]);
@@ -1765,14 +1778,14 @@ smb2_ioctl_query_info(const unsigned int xid,
rc = SMB2_close_init(tcon, server,
&rqst[2], COMPOUND_FID, COMPOUND_FID, false);
if (rc)
- goto iqinf_exit;
+ goto free_req_1;
smb2_set_related(&rqst[2]);
rc = compound_send_recv(xid, ses, server,
flags, 3, rqst,
resp_buftype, rsp_iov);
if (rc)
- goto iqinf_exit;
+ goto out;
/* No need to bump num_remote_opens since handle immediately closed */
if (qi.flags & PASSTHRU_FSCTL) {
@@ -1782,18 +1795,22 @@ smb2_ioctl_query_info(const unsigned int xid,
qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
if (qi.input_buffer_length > 0 &&
le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length
- > rsp_iov[1].iov_len)
- goto e_fault;
+ > rsp_iov[1].iov_len) {
+ rc = -EFAULT;
+ goto out;
+ }
if (copy_to_user(&pqi->input_buffer_length,
&qi.input_buffer_length,
- sizeof(qi.input_buffer_length)))
- goto e_fault;
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto out;
+ }
if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info),
(const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset),
qi.input_buffer_length))
- goto e_fault;
+ rc = -EFAULT;
} else {
pqi = (struct smb_query_info __user *)arg;
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
@@ -1801,28 +1818,30 @@ smb2_ioctl_query_info(const unsigned int xid,
qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
if (copy_to_user(&pqi->input_buffer_length,
&qi.input_buffer_length,
- sizeof(qi.input_buffer_length)))
- goto e_fault;
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto out;
+ }
if (copy_to_user(pqi + 1, qi_rsp->Buffer,
qi.input_buffer_length))
- goto e_fault;
+ rc = -EFAULT;
}
- iqinf_exit:
- cifs_small_buf_release(rqst[0].rq_iov[0].iov_base);
- cifs_small_buf_release(rqst[1].rq_iov[0].iov_base);
- cifs_small_buf_release(rqst[2].rq_iov[0].iov_base);
+out:
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
- kfree(vars);
+ SMB2_close_free(&rqst[2]);
+free_req_1:
+ free_req1_func(&rqst[1]);
+free_open_req:
+ SMB2_open_free(&rqst[0]);
+free_output_buffer:
kfree(buffer);
+free_vars:
+ kfree(vars);
return rc;
-
-e_fault:
- rc = -EFAULT;
- goto iqinf_exit;
}
static ssize_t
@@ -1839,9 +1858,17 @@ smb2_copychunk_range(const unsigned int xid,
int chunks_copied = 0;
bool chunk_sizes_updated = false;
ssize_t bytes_written, total_bytes_written = 0;
+ struct inode *inode;
pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
+ /*
+ * We need to flush all unwritten data before we can send the
+ * copychunk ioctl to the server.
+ */
+ inode = d_inode(trgtfile->dentry);
+ filemap_write_and_wait(inode->i_mapping);
+
if (pcchunk == NULL)
return -ENOMEM;
@@ -2391,7 +2418,7 @@ again:
/* If the open failed there is nothing to do */
op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
- if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) {
+ if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) {
cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc);
goto qdf_free;
}
@@ -2410,7 +2437,7 @@ again:
atomic_inc(&tcon->num_remote_opens);
qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base;
- if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ if (qd_rsp->hdr.Status == STATUS_NO_MORE_FILES) {
trace_smb3_query_dir_done(xid, fid->persistent_fid,
tcon->tid, tcon->ses->Suid, 0, 0);
srch_inf->endOfSearch = true;
@@ -2462,7 +2489,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
static bool
smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
int scredits, in_flight;
if (shdr->Status != STATUS_PENDING)
@@ -2476,7 +2503,7 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_pend_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n",
@@ -2489,13 +2516,14 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
static bool
smb2_is_session_expired(char *buf)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED &&
shdr->Status != STATUS_USER_SESSION_DELETED)
return false;
- trace_smb3_ses_expired(shdr->TreeId, shdr->SessionId,
+ trace_smb3_ses_expired(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
le16_to_cpu(shdr->Command),
le64_to_cpu(shdr->MessageId));
cifs_dbg(FYI, "Session expired or deleted\n");
@@ -2506,7 +2534,7 @@ smb2_is_session_expired(char *buf)
static bool
smb2_is_status_io_timeout(char *buf)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
if (shdr->Status == STATUS_IO_TIMEOUT)
return true;
@@ -2517,7 +2545,7 @@ smb2_is_status_io_timeout(char *buf)
static void
smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct list_head *tmp, *tmp1;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -2530,7 +2558,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
- if (tcon->tid == shdr->TreeId) {
+ if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
tcon->need_reconnect = true;
spin_unlock(&cifs_tcp_ses_lock);
pr_warn_once("Server share %s deleted.\n",
@@ -2558,9 +2586,9 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
void
smb2_set_related(struct smb_rqst *rqst)
{
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
- shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+ shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
if (shdr == NULL) {
cifs_dbg(FYI, "shdr NULL in smb2_set_related\n");
return;
@@ -2573,13 +2601,13 @@ char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0};
void
smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
{
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = ses->server;
unsigned long len = smb_rqst_len(server, rqst);
int i, num_padding;
- shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+ shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
if (shdr == NULL) {
cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n");
return;
@@ -2633,7 +2661,7 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
*/
int
smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
- __le16 *utf16_path, u32 desired_access,
+ const char *path, u32 desired_access,
u32 class, u32 type, u32 output_len,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb)
@@ -2651,6 +2679,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
int rc;
+ __le16 *utf16_path;
+ struct cached_fid *cfid = NULL;
+
+ if (!path)
+ path = "";
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -2659,6 +2695,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
+ rc = open_cached_dir(xid, tcon, path, cifs_sb, &cfid);
+
memset(&open_iov, 0, sizeof(open_iov));
rqst[0].rq_iov = open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
@@ -2680,15 +2718,29 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
rqst[1].rq_iov = qi_iov;
rqst[1].rq_nvec = 1;
- rc = SMB2_query_info_init(tcon, server,
- &rqst[1], COMPOUND_FID, COMPOUND_FID,
- class, type, 0,
- output_len, 0,
- NULL);
+ if (cfid) {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[1],
+ cfid->fid->persistent_fid,
+ cfid->fid->volatile_fid,
+ class, type, 0,
+ output_len, 0,
+ NULL);
+ } else {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[1],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ class, type, 0,
+ output_len, 0,
+ NULL);
+ }
if (rc)
goto qic_exit;
- smb2_set_next_command(tcon, &rqst[1]);
- smb2_set_related(&rqst[1]);
+ if (!cfid) {
+ smb2_set_next_command(tcon, &rqst[1]);
+ smb2_set_related(&rqst[1]);
+ }
memset(&close_iov, 0, sizeof(close_iov));
rqst[2].rq_iov = close_iov;
@@ -2700,9 +2752,15 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
goto qic_exit;
smb2_set_related(&rqst[2]);
- rc = compound_send_recv(xid, ses, server,
- flags, 3, rqst,
- resp_buftype, rsp_iov);
+ if (cfid) {
+ rc = compound_send_recv(xid, ses, server,
+ flags, 1, &rqst[1],
+ &resp_buftype[1], &rsp_iov[1]);
+ } else {
+ rc = compound_send_recv(xid, ses, server,
+ flags, 3, rqst,
+ resp_buftype, rsp_iov);
+ }
if (rc) {
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
if (rc == -EREMCHG) {
@@ -2716,11 +2774,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
*buftype = resp_buftype[1];
qic_exit:
+ kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_query_info_free(&rqst[1]);
SMB2_close_free(&rqst[2]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ if (cfid)
+ close_cached_dir(cfid);
return rc;
}
@@ -2730,13 +2791,12 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
{
struct smb2_query_info_rsp *rsp;
struct smb2_fs_full_size_info *info = NULL;
- __le16 utf16_path = 0; /* Null - open root of share */
struct kvec rsp_iov = {NULL, 0};
int buftype = CIFS_NO_BUFFER;
int rc;
- rc = smb2_query_info_compound(xid, tcon, &utf16_path,
+ rc = smb2_query_info_compound(xid, tcon, "",
FILE_READ_ATTRIBUTES,
FS_FULL_SIZE_INFORMATION,
SMB2_O_INFO_FILESYSTEM,
@@ -2746,7 +2806,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
goto qfs_exit;
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
- buf->f_type = SMB2_MAGIC_NUMBER;
+ buf->f_type = SMB2_SUPER_MAGIC;
info = (struct smb2_fs_full_size_info *)(
le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
@@ -2788,7 +2848,7 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB311_posix_qfs_info(xid, tcon, fid.persistent_fid,
fid.volatile_fid, buf);
- buf->f_type = SMB2_MAGIC_NUMBER;
+ buf->f_type = SMB2_SUPER_MAGIC;
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
return rc;
}
@@ -2843,6 +2903,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
struct fsctl_get_dfs_referral_req *dfs_req = NULL;
struct get_dfs_referral_rsp *dfs_rsp = NULL;
u32 dfs_req_size = 0, dfs_rsp_size = 0;
+ int retry_count = 0;
cifs_dbg(FYI, "%s: path: %s\n", __func__, search_name);
@@ -2894,11 +2955,14 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
true /* is_fsctl */,
(char *)dfs_req, dfs_req_size, CIFSMaxBufSize,
(char **)&dfs_rsp, &dfs_rsp_size);
- } while (rc == -EAGAIN);
+ if (!is_retryable_error(rc))
+ break;
+ usleep_range(512, 2048);
+ } while (++retry_count < 5);
if (rc) {
- if ((rc != -ENOENT) && (rc != -EOPNOTSUPP))
- cifs_tcon_dbg(VFS, "ioctl error in %s rc=%d\n", __func__, rc);
+ if (!is_retryable_error(rc) && rc != -ENOENT && rc != -EOPNOTSUPP)
+ cifs_tcon_dbg(VFS, "%s: ioctl error: rc=%d\n", __func__, rc);
goto out;
}
@@ -3124,7 +3188,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype, rsp_iov);
create_rsp = rsp_iov[0].iov_base;
- if (create_rsp && create_rsp->sync_hdr.Status)
+ if (create_rsp && create_rsp->hdr.Status)
err_iov = rsp_iov[0];
ioctl_rsp = rsp_iov[1].iov_base;
@@ -3871,29 +3935,38 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
{
int rc;
unsigned int xid;
+ struct inode *inode;
struct cifsFileInfo *cfile = file->private_data;
+ struct cifsInodeInfo *cifsi;
__le64 eof;
xid = get_xid();
- if (off >= i_size_read(file->f_inode) ||
- off + len >= i_size_read(file->f_inode)) {
+ inode = d_inode(cfile->dentry);
+ cifsi = CIFS_I(inode);
+
+ if (off >= i_size_read(inode) ||
+ off + len >= i_size_read(inode)) {
rc = -EINVAL;
goto out;
}
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
- i_size_read(file->f_inode) - off - len, off);
+ i_size_read(inode) - off - len, off);
if (rc < 0)
goto out;
- eof = cpu_to_le64(i_size_read(file->f_inode) - len);
+ eof = cpu_to_le64(i_size_read(inode) - len);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
if (rc < 0)
goto out;
rc = 0;
+
+ cifsi->server_eof = i_size_read(inode) - len;
+ truncate_setsize(inode, cifsi->server_eof);
+ fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
out:
free_xid(xid);
return rc;
@@ -4267,12 +4340,12 @@ static __le32
map_oplock_to_lease(u8 oplock)
{
if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
- return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING;
+ return SMB2_LEASE_WRITE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE;
else if (oplock == SMB2_OPLOCK_LEVEL_II)
- return SMB2_LEASE_READ_CACHING;
+ return SMB2_LEASE_READ_CACHING_LE;
else if (oplock == SMB2_OPLOCK_LEVEL_BATCH)
- return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING |
- SMB2_LEASE_WRITE_CACHING;
+ return SMB2_LEASE_HANDLE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE |
+ SMB2_LEASE_WRITE_CACHING_LE;
return 0;
}
@@ -4334,7 +4407,7 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
struct create_lease *lc = (struct create_lease *)buf;
*epoch = 0; /* not used */
- if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+ if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE)
return SMB2_OPLOCK_LEVEL_NOCHANGE;
return le32_to_cpu(lc->lcontext.LeaseState);
}
@@ -4345,7 +4418,7 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
*epoch = le16_to_cpu(lc->lcontext.Epoch);
- if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+ if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE)
return SMB2_OPLOCK_LEVEL_NOCHANGE;
if (lease_key)
memcpy(lease_key, &lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
@@ -4369,8 +4442,8 @@ static void
fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
struct smb_rqst *old_rq, __le16 cipher_type)
{
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)old_rq->rq_iov[0].iov_base;
memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
@@ -4496,7 +4569,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
- rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key);
+ rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__,
enc ? "en" : "de");
@@ -4788,7 +4861,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
unsigned int cur_page_idx;
unsigned int pad_len;
struct cifs_readdata *rdata = mid->callback_data;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct bio_vec *bvec = NULL;
struct iov_iter iter;
struct kvec iov;
@@ -4803,7 +4876,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
if (!is_offloaded)
- cifs_reconnect(server);
+ cifs_reconnect(server, true);
return -1;
}
@@ -4976,10 +5049,12 @@ static void smb2_decrypt_offload(struct work_struct *work)
mid->callback(mid);
} else {
+ spin_lock(&cifs_tcp_ses_lock);
spin_lock(&GlobalMid_Lock);
if (dw->server->tcpStatus == CifsNeedReconnect) {
mid->mid_state = MID_RETRY_NEEDED;
spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_tcp_ses_lock);
mid->callback(mid);
} else {
mid->mid_state = MID_REQUEST_SUBMITTED;
@@ -4987,6 +5062,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
list_add_tail(&mid->qhead,
&dw->server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_tcp_ses_lock);
}
}
cifs_mid_q_entry_release(mid);
@@ -5117,7 +5193,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
{
int ret, length;
char *buf = server->smallbuf;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
unsigned int pdu_length = server->pdu_size;
unsigned int buf_size;
struct mid_q_entry *mid_entry;
@@ -5147,7 +5223,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
next_is_large = server->large_buf;
one_more:
- shdr = (struct smb2_sync_hdr *)buf;
+ shdr = (struct smb2_hdr *)buf;
if (shdr->NextCommand) {
if (next_is_large)
next_buffer = (char *)cifs_buf_get();
@@ -5213,16 +5289,16 @@ smb3_receive_transform(struct TCP_Server_Info *server,
unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
if (pdu_length < sizeof(struct smb2_transform_hdr) +
- sizeof(struct smb2_sync_hdr)) {
+ sizeof(struct smb2_hdr)) {
cifs_server_dbg(VFS, "Transform message is too small (%u)\n",
pdu_length);
- cifs_reconnect(server);
+ cifs_reconnect(server, true);
return -ECONNABORTED;
}
if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) {
cifs_server_dbg(VFS, "Transform message is broken\n");
- cifs_reconnect(server);
+ cifs_reconnect(server, true);
return -ECONNABORTED;
}
@@ -5246,7 +5322,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
static int
smb2_next_header(char *buf)
{
- struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf;
if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM)
@@ -5785,10 +5861,10 @@ struct smb_version_values smb20_values = {
.protocol_id = SMB20_PROT_ID,
.req_capabilities = 0, /* MBZ */
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5806,10 +5882,10 @@ struct smb_version_values smb21_values = {
.protocol_id = SMB21_PROT_ID,
.req_capabilities = 0, /* MBZ on negotiate req until SMB3 dialect */
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5827,10 +5903,10 @@ struct smb_version_values smb3any_values = {
.protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5848,10 +5924,10 @@ struct smb_version_values smbdefault_values = {
.protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5869,10 +5945,10 @@ struct smb_version_values smb30_values = {
.protocol_id = SMB30_PROT_ID,
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5890,10 +5966,10 @@ struct smb_version_values smb302_values = {
.protocol_id = SMB302_PROT_ID,
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5911,10 +5987,10 @@ struct smb_version_values smb311_values = {
.protocol_id = SMB311_PROT_ID,
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 7829c590eeac..1b7ad0c09566 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -23,7 +23,6 @@
#include <linux/uuid.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
-#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsacl.h"
#include "cifsproto.h"
@@ -84,7 +83,7 @@ int smb3_encryption_required(const struct cifs_tcon *tcon)
}
static void
-smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
+smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd,
const struct cifs_tcon *tcon,
struct TCP_Server_Info *server)
{
@@ -104,7 +103,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
} else {
shdr->CreditRequest = cpu_to_le16(2);
}
- shdr->ProcessId = cpu_to_le32((__u16)current->tgid);
+ shdr->Id.SyncId.ProcessId = cpu_to_le32((__u16)current->tgid);
if (!tcon)
goto out;
@@ -115,10 +114,10 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
shdr->CreditCharge = cpu_to_le16(1);
/* else CreditCharge MBZ */
- shdr->TreeId = tcon->tid;
+ shdr->Id.SyncId.TreeId = cpu_to_le32(tcon->tid);
/* Uid is not converted */
if (tcon->ses)
- shdr->SessionId = tcon->ses->Suid;
+ shdr->SessionId = cpu_to_le64(tcon->ses->Suid);
/*
* If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
@@ -143,7 +142,7 @@ static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct TCP_Server_Info *server)
{
- int rc;
+ int rc = 0;
struct nls_table *nls_codepage;
struct cifs_ses *ses;
int retries;
@@ -156,10 +155,15 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
if (tcon == NULL)
return 0;
- if (smb2_command == SMB2_TREE_CONNECT)
+ /*
+ * Need to also skip SMB2_IOCTL because it is used for checking nested dfs links in
+ * cifs_tree_connect().
+ */
+ if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL)
return 0;
- if (tcon->tidStatus == CifsExiting) {
+ spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->status == TID_EXITING) {
/*
* only tree disconnect, open, and write,
* (and ulogoff which does not have tcon)
@@ -168,11 +172,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
if ((smb2_command != SMB2_WRITE) &&
(smb2_command != SMB2_CREATE) &&
(smb2_command != SMB2_TREE_DISCONNECT)) {
+ spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "can not send cmd %d while umounting\n",
smb2_command);
return -ENODEV;
}
}
+ spin_unlock(&cifs_tcp_ses_lock);
if ((!tcon->ses) || (tcon->ses->status == CifsExiting) ||
(!tcon->ses->server) || !server)
return -EIO;
@@ -211,8 +217,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
}
/* are we still trying to reconnect? */
- if (server->tcpStatus != CifsNeedReconnect)
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus != CifsNeedReconnect) {
+ spin_unlock(&cifs_tcp_ses_lock);
break;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
if (retries && --retries)
continue;
@@ -229,64 +239,74 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
retries = server->nr_targets;
}
- if (!tcon->ses->need_reconnect && !tcon->need_reconnect)
+ spin_lock(&ses->chan_lock);
+ if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) {
+ spin_unlock(&ses->chan_lock);
return 0;
+ }
+ spin_unlock(&ses->chan_lock);
+ cifs_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d",
+ tcon->ses->chans_need_reconnect,
+ tcon->need_reconnect);
nls_codepage = load_nls_default();
/*
- * need to prevent multiple threads trying to simultaneously reconnect
- * the same SMB session
- */
- mutex_lock(&tcon->ses->session_mutex);
-
- /*
* Recheck after acquire mutex. If another thread is negotiating
* and the server never sends an answer the socket will be closed
* and tcpStatus set to reconnect.
*/
+ spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus == CifsNeedReconnect) {
+ spin_unlock(&cifs_tcp_ses_lock);
rc = -EHOSTDOWN;
- mutex_unlock(&tcon->ses->session_mutex);
goto out;
}
+ spin_unlock(&cifs_tcp_ses_lock);
/*
- * If we are reconnecting an extra channel, bind
+ * need to prevent multiple threads trying to simultaneously
+ * reconnect the same SMB session
*/
- if (server->is_channel) {
- ses->binding = true;
- ses->binding_chan = cifs_ses_find_chan(ses, server);
+ spin_lock(&ses->chan_lock);
+ if (!cifs_chan_needs_reconnect(ses, server)) {
+ spin_unlock(&ses->chan_lock);
+
+ /* this means that we only need to tree connect */
+ if (tcon->need_reconnect)
+ goto skip_sess_setup;
+
+ goto out;
}
+ spin_unlock(&ses->chan_lock);
- rc = cifs_negotiate_protocol(0, tcon->ses);
- if (!rc && tcon->ses->need_reconnect) {
- rc = cifs_setup_session(0, tcon->ses, nls_codepage);
+ mutex_lock(&ses->session_mutex);
+ rc = cifs_negotiate_protocol(0, ses, server);
+ if (!rc) {
+ rc = cifs_setup_session(0, ses, server, nls_codepage);
if ((rc == -EACCES) && !tcon->retry) {
+ mutex_unlock(&ses->session_mutex);
rc = -EHOSTDOWN;
- ses->binding = false;
- ses->binding_chan = NULL;
- mutex_unlock(&tcon->ses->session_mutex);
goto failed;
}
+ } else {
+ mutex_unlock(&ses->session_mutex);
+ goto out;
}
- /*
- * End of channel binding
- */
- ses->binding = false;
- ses->binding_chan = NULL;
+ mutex_unlock(&ses->session_mutex);
- if (rc || !tcon->need_reconnect) {
- mutex_unlock(&tcon->ses->session_mutex);
+skip_sess_setup:
+ mutex_lock(&ses->session_mutex);
+ if (!tcon->need_reconnect) {
+ mutex_unlock(&ses->session_mutex);
goto out;
}
-
cifs_mark_open_files_invalid(tcon);
if (tcon->use_persistent)
tcon->need_reopen_files = true;
rc = cifs_tree_connect(0, tcon, nls_codepage);
- mutex_unlock(&tcon->ses->session_mutex);
+ mutex_unlock(&ses->session_mutex);
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc) {
@@ -331,7 +351,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
void *buf,
unsigned int *total_len)
{
- struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf;
+ struct smb2_pdu *spdu = (struct smb2_pdu *)buf;
/* lookup word count ie StructureSize from table */
__u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)];
@@ -341,10 +361,10 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
*/
memset(buf, 0, 256);
- smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon, server);
+ smb2_hdr_assemble(&spdu->hdr, smb2_command, tcon, server);
spdu->StructureSize2 = cpu_to_le16(parmsize);
- *total_len = parmsize + sizeof(struct smb2_sync_hdr);
+ *total_len = parmsize + sizeof(struct smb2_hdr);
}
/*
@@ -367,7 +387,7 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
}
fill_small_buf(smb2_command, tcon, server,
- (struct smb2_sync_hdr *)(*request_buf),
+ (struct smb2_hdr *)(*request_buf),
total_len);
if (tcon != NULL) {
@@ -414,8 +434,8 @@ build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES;
pneg_ctxt->DataLength = cpu_to_le16(38);
pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1);
- pneg_ctxt->SaltLength = cpu_to_le16(SMB311_LINUX_CLIENT_SALT_SIZE);
- get_random_bytes(pneg_ctxt->Salt, SMB311_LINUX_CLIENT_SALT_SIZE);
+ pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE);
+ get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE);
pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512;
}
@@ -830,7 +850,9 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
*/
int
-SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
+SMB2_negotiate(const unsigned int xid,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
{
struct smb_rqst rqst;
struct smb2_negotiate_req *req;
@@ -839,7 +861,6 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
struct kvec rsp_iov;
int rc = 0;
int resp_buftype;
- struct TCP_Server_Info *server = cifs_ses_server(ses);
int blob_offset, blob_length;
char *security_blob;
int flags = CIFS_NEG_OP;
@@ -857,7 +878,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
if (rc)
return rc;
- req->sync_hdr.SessionId = 0;
+ req->hdr.SessionId = 0;
memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
@@ -1018,7 +1039,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
server->cipher_type = SMB2_ENCRYPTION_AES128_CCM;
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
- (struct smb2_sync_hdr *)rsp);
+ (struct smb2_hdr *)rsp);
/*
* See MS-SMB2 section 2.2.4: if no blob, client picks default which
* for us will be
@@ -1218,6 +1239,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
struct SMB2_sess_data {
unsigned int xid;
struct cifs_ses *ses;
+ struct TCP_Server_Info *server;
struct nls_table *nls_cp;
void (*func)(struct SMB2_sess_data *);
int result;
@@ -1239,9 +1261,10 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
{
int rc;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
struct smb2_sess_setup_req *req;
- struct TCP_Server_Info *server = cifs_ses_server(ses);
unsigned int total_len;
+ bool is_binding = false;
rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, server,
(void **) &req,
@@ -1249,24 +1272,31 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
if (rc)
return rc;
- if (sess_data->ses->binding) {
- req->sync_hdr.SessionId = sess_data->ses->Suid;
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ spin_lock(&ses->chan_lock);
+ is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+ spin_unlock(&ses->chan_lock);
+
+ if (is_binding) {
+ req->hdr.SessionId = cpu_to_le64(ses->Suid);
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
req->PreviousSessionId = 0;
req->Flags = SMB2_SESSION_REQ_FLAG_BINDING;
+ cifs_dbg(FYI, "Binding to sess id: %llx\n", ses->Suid);
} else {
/* First session, not a reauthenticate */
- req->sync_hdr.SessionId = 0;
+ req->hdr.SessionId = 0;
/*
* if reconnect, we need to send previous sess id
* otherwise it is 0
*/
- req->PreviousSessionId = sess_data->previous_session;
+ req->PreviousSessionId = cpu_to_le64(sess_data->previous_session);
req->Flags = 0; /* MBZ */
+ cifs_dbg(FYI, "Fresh session. Previous: %llx\n",
+ sess_data->previous_session);
}
/* enough to enable echos and oplocks and one max size write */
- req->sync_hdr.CreditRequest = cpu_to_le16(130);
+ req->hdr.CreditRequest = cpu_to_le16(130);
/* only one of SMB2 signing flags may be set in SMB2 request */
if (server->sign)
@@ -1322,7 +1352,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
/* BB add code to build os and lm fields */
rc = cifs_send_recv(sess_data->xid, sess_data->ses,
- cifs_ses_server(sess_data->ses),
+ sess_data->server,
&rqst,
&sess_data->buf0_type,
CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov);
@@ -1337,11 +1367,11 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
{
int rc = 0;
struct cifs_ses *ses = sess_data->ses;
- struct TCP_Server_Info *server = cifs_ses_server(ses);
+ struct TCP_Server_Info *server = sess_data->server;
mutex_lock(&server->srv_mutex);
if (server->ops->generate_signingkey) {
- rc = server->ops->generate_signingkey(ses);
+ rc = server->ops->generate_signingkey(ses, server);
if (rc) {
cifs_dbg(FYI,
"SMB3 session key generation failed\n");
@@ -1356,14 +1386,6 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
mutex_unlock(&server->srv_mutex);
cifs_dbg(FYI, "SMB2/3 session established successfully\n");
- /* keep existing ses state if binding */
- if (!ses->binding) {
- spin_lock(&GlobalMid_Lock);
- ses->status = CifsGood;
- ses->need_reconnect = false;
- spin_unlock(&GlobalMid_Lock);
- }
-
return rc;
}
@@ -1373,15 +1395,17 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
{
int rc;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
struct cifs_spnego_msg *msg;
struct key *spnego_key = NULL;
struct smb2_sess_setup_rsp *rsp = NULL;
+ bool is_binding = false;
rc = SMB2_sess_alloc_buffer(sess_data);
if (rc)
goto out;
- spnego_key = cifs_get_spnego_key(ses);
+ spnego_key = cifs_get_spnego_key(ses, server);
if (IS_ERR(spnego_key)) {
rc = PTR_ERR(spnego_key);
if (rc == -ENOKEY)
@@ -1402,8 +1426,12 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
goto out_put_spnego_key;
}
+ spin_lock(&ses->chan_lock);
+ is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+ spin_unlock(&ses->chan_lock);
+
/* keep session key if binding */
- if (!ses->binding) {
+ if (!is_binding) {
ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
GFP_KERNEL);
if (!ses->auth_key.response) {
@@ -1424,8 +1452,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
/* keep session id and flags if binding */
- if (!ses->binding) {
- ses->Suid = rsp->sync_hdr.SessionId;
+ if (!is_binding) {
+ ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
}
@@ -1456,10 +1484,12 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
{
int rc;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
struct smb2_sess_setup_rsp *rsp = NULL;
- char *ntlmssp_blob = NULL;
+ unsigned char *ntlmssp_blob = NULL;
bool use_spnego = false; /* else use raw ntlmssp */
u16 blob_length = 0;
+ bool is_binding = false;
/*
* If memory allocation is successful, caller of this function
@@ -1476,22 +1506,17 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
if (rc)
goto out_err;
- ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE),
- GFP_KERNEL);
- if (ntlmssp_blob == NULL) {
- rc = -ENOMEM;
- goto out;
- }
+ rc = build_ntlmssp_smb3_negotiate_blob(&ntlmssp_blob,
+ &blob_length, ses, server,
+ sess_data->nls_cp);
+ if (rc)
+ goto out_err;
- build_ntlmssp_negotiate_blob(ntlmssp_blob, ses);
if (use_spnego) {
/* BB eventually need to add this */
cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
rc = -EOPNOTSUPP;
goto out;
- } else {
- blob_length = sizeof(struct _NEGOTIATE_MESSAGE);
- /* with raw NTLMSSP we don't encapsulate in SPNEGO */
}
sess_data->iov[1].iov_base = ntlmssp_blob;
sess_data->iov[1].iov_len = blob_length;
@@ -1501,7 +1526,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
/* If true, rc here is expected and not an error */
if (sess_data->buf0_type != CIFS_NO_BUFFER &&
- rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
+ rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
rc = 0;
if (rc)
@@ -1521,9 +1546,13 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
+ spin_lock(&ses->chan_lock);
+ is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+ spin_unlock(&ses->chan_lock);
+
/* keep existing ses id and flags if binding */
- if (!ses->binding) {
- ses->Suid = rsp->sync_hdr.SessionId;
+ if (!is_binding) {
+ ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
}
@@ -1547,21 +1576,24 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
{
int rc;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
struct smb2_sess_setup_req *req;
struct smb2_sess_setup_rsp *rsp = NULL;
unsigned char *ntlmssp_blob = NULL;
bool use_spnego = false; /* else use raw ntlmssp */
u16 blob_length = 0;
+ bool is_binding = false;
rc = SMB2_sess_alloc_buffer(sess_data);
if (rc)
goto out;
req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base;
- req->sync_hdr.SessionId = ses->Suid;
+ req->hdr.SessionId = cpu_to_le64(ses->Suid);
- rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses,
- sess_data->nls_cp);
+ rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length,
+ ses, server,
+ sess_data->nls_cp);
if (rc) {
cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", rc);
goto out;
@@ -1582,9 +1614,13 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
+ spin_lock(&ses->chan_lock);
+ is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+ spin_unlock(&ses->chan_lock);
+
/* keep existing ses id and flags if binding */
- if (!ses->binding) {
- ses->Suid = rsp->sync_hdr.SessionId;
+ if (!is_binding) {
+ ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
}
@@ -1614,11 +1650,13 @@ out:
}
static int
-SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data)
+SMB2_select_sec(struct SMB2_sess_data *sess_data)
{
int type;
+ struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = sess_data->server;
- type = smb2_select_sectype(cifs_ses_server(ses), ses->sectype);
+ type = smb2_select_sectype(server, ses->sectype);
cifs_dbg(FYI, "sess setup type %d\n", type);
if (type == Unspecified) {
cifs_dbg(VFS, "Unable to select appropriate authentication method!\n");
@@ -1642,10 +1680,10 @@ SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data)
int
SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
const struct nls_table *nls_cp)
{
int rc = 0;
- struct TCP_Server_Info *server = cifs_ses_server(ses);
struct SMB2_sess_data *sess_data;
cifs_dbg(FYI, "Session Setup\n");
@@ -1659,15 +1697,17 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
if (!sess_data)
return -ENOMEM;
- rc = SMB2_select_sec(ses, sess_data);
- if (rc)
- goto out;
sess_data->xid = xid;
sess_data->ses = ses;
+ sess_data->server = server;
sess_data->buf0_type = CIFS_NO_BUFFER;
sess_data->nls_cp = (struct nls_table *) nls_cp;
sess_data->previous_session = ses->Suid;
+ rc = SMB2_select_sec(sess_data);
+ if (rc)
+ goto out;
+
/*
* Initialize the session hash with the server one.
*/
@@ -1706,8 +1746,12 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
return -EIO;
/* no need to send SMB logoff if uid already closed due to reconnect */
- if (ses->need_reconnect)
+ spin_lock(&ses->chan_lock);
+ if (CIFS_ALL_CHANS_NEED_RECONNECT(ses)) {
+ spin_unlock(&ses->chan_lock);
goto smb2_session_already_dead;
+ }
+ spin_unlock(&ses->chan_lock);
rc = smb2_plain_req_init(SMB2_LOGOFF, NULL, ses->server,
(void **) &req, &total_len);
@@ -1715,12 +1759,12 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
return rc;
/* since no tcon, smb2_init can not do this, so do here */
- req->sync_hdr.SessionId = ses->Suid;
+ req->hdr.SessionId = cpu_to_le64(ses->Suid);
if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
flags |= CIFS_TRANSFORM_REQ;
else if (server->sign)
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
flags |= CIFS_NO_RSP_BUF;
@@ -1828,21 +1872,21 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
!(ses->session_flags &
(SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) &&
((ses->user_name != NULL) || (ses->sectype == Kerberos)))
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
memset(&rqst, 0, sizeof(struct smb_rqst));
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
/* Need 64 for max size write so ask for more in case not there yet */
- req->sync_hdr.CreditRequest = cpu_to_le16(64);
+ req->hdr.CreditRequest = cpu_to_le16(64);
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
trace_smb3_tcon(xid, tcon->tid, ses->Suid, tree, rc);
- if (rc != 0) {
+ if ((rc != 0) || (rsp == NULL)) {
cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE);
tcon->need_reconnect = true;
goto tcon_error_exit;
@@ -1869,9 +1913,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
tcon->share_flags = le32_to_cpu(rsp->ShareFlags);
tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
- tcon->tidStatus = CifsGood;
- tcon->need_reconnect = false;
- tcon->tid = rsp->sync_hdr.TreeId;
+ tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId);
strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
@@ -1892,9 +1934,8 @@ tcon_exit:
return rc;
tcon_error_exit:
- if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
+ if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME)
cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
- }
goto tcon_exit;
}
@@ -1916,8 +1957,13 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if (!ses || !(ses->server))
return -EIO;
- if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
+ spin_lock(&ses->chan_lock);
+ if ((tcon->need_reconnect) ||
+ (CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses))) {
+ spin_unlock(&ses->chan_lock);
return 0;
+ }
+ spin_unlock(&ses->chan_lock);
close_cached_dir_lease(&tcon->crfid);
@@ -2530,8 +2576,13 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
cp = load_nls_default();
cifs_strtoUTF16(*out_path, treename, treename_len, cp);
- UniStrcat(*out_path, sep);
- UniStrcat(*out_path, path);
+
+ /* Do not append the separator if the path is empty */
+ if (path[0] != cpu_to_le16(0x0000)) {
+ UniStrcat(*out_path, sep);
+ UniStrcat(*out_path, path);
+ }
+
unload_nls(cp);
return 0;
@@ -2608,7 +2659,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
if (tcon->share_flags & SHI1005_FLAGS_DFS) {
int name_len;
- req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
+ req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
tcon->treeName, utf16_path);
@@ -2671,10 +2722,20 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
goto err_free_rsp_buf;
}
+ /*
+ * Although unlikely to be possible for rsp to be null and rc not set,
+ * adding check below is slightly safer long term (and quiets Coverity
+ * warning)
+ */
rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
- trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid,
- ses->Suid, CREATE_NOT_FILE,
- FILE_WRITE_ATTRIBUTES);
+ if (rsp == NULL) {
+ rc = -EIO;
+ kfree(pc_buf);
+ goto err_free_req;
+ }
+
+ trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, ses->Suid,
+ CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES);
SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId);
@@ -2740,7 +2801,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (tcon->share_flags & SHI1005_FLAGS_DFS) {
int name_len;
- req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
+ req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
tcon->treeName, path);
@@ -2942,17 +3003,18 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
tcon->need_reconnect = true;
}
goto creat_exit;
- } else
- trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid,
- ses->Suid, oparms->create_options,
- oparms->desired_access);
+ } else if (rsp == NULL) /* unlikely to happen, but safer to check */
+ goto creat_exit;
+ else
+ trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, ses->Suid,
+ oparms->create_options, oparms->desired_access);
atomic_inc(&tcon->num_remote_opens);
oparms->fid->persistent_fid = rsp->PersistentFileId;
oparms->fid->volatile_fid = rsp->VolatileFileId;
oparms->fid->access = oparms->desired_access;
#ifdef CONFIG_CIFS_DEBUG2
- oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId);
+ oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
if (buf) {
@@ -3052,7 +3114,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
* response size smaller.
*/
req->MaxOutputResponse = cpu_to_le32(max_response_size);
- req->sync_hdr.CreditCharge =
+ req->hdr.CreditCharge =
cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
SMB2_MAX_BUFFER_SIZE));
if (is_fsctl)
@@ -3062,7 +3124,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
return 0;
}
@@ -3162,6 +3224,16 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if ((plen == NULL) || (out_data == NULL))
goto ioctl_exit;
+ /*
+ * Although unlikely to be possible for rsp to be null and rc not set,
+ * adding check below is slightly safer long term (and quiets Coverity
+ * warning)
+ */
+ if (rsp == NULL) {
+ rc = -EIO;
+ goto ioctl_exit;
+ }
+
*plen = le32_to_cpu(rsp->OutputCount);
/* We check for obvious errors in the output buffer length and offset */
@@ -3687,7 +3759,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
if (mid->mid_state == MID_RESPONSE_RECEIVED
|| mid->mid_state == MID_RESPONSE_MALFORMED) {
- credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
}
@@ -3699,27 +3771,35 @@ void smb2_reconnect_server(struct work_struct *work)
{
struct TCP_Server_Info *server = container_of(work,
struct TCP_Server_Info, reconnect.work);
- struct cifs_ses *ses;
+ struct TCP_Server_Info *pserver;
+ struct cifs_ses *ses, *ses2;
struct cifs_tcon *tcon, *tcon2;
- struct list_head tmp_list;
- int tcon_exist = false;
+ struct list_head tmp_list, tmp_ses_list;
+ bool tcon_exist = false, ses_exist = false;
+ bool tcon_selected = false;
int rc;
- int resched = false;
+ bool resched = false;
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
/* Prevent simultaneous reconnects that can corrupt tcon->rlist list */
- mutex_lock(&server->reconnect_mutex);
+ mutex_lock(&pserver->reconnect_mutex);
INIT_LIST_HEAD(&tmp_list);
- cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n");
+ INIT_LIST_HEAD(&tmp_ses_list);
+ cifs_dbg(FYI, "Reconnecting tcons and channels\n");
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+
+ tcon_selected = false;
+
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->need_reconnect || tcon->need_reopen_files) {
tcon->tc_count++;
list_add_tail(&tcon->rlist, &tmp_list);
- tcon_exist = true;
+ tcon_selected = tcon_exist = true;
}
}
/*
@@ -3728,15 +3808,27 @@ void smb2_reconnect_server(struct work_struct *work)
*/
if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) {
list_add_tail(&ses->tcon_ipc->rlist, &tmp_list);
- tcon_exist = true;
+ tcon_selected = tcon_exist = true;
+ ses->ses_count++;
+ }
+ /*
+ * handle the case where channel needs to reconnect
+ * binding session, but tcon is healthy (some other channel
+ * is active)
+ */
+ spin_lock(&ses->chan_lock);
+ if (!tcon_selected && cifs_chan_needs_reconnect(ses, server)) {
+ list_add_tail(&ses->rlist, &tmp_ses_list);
+ ses_exist = true;
ses->ses_count++;
}
+ spin_unlock(&ses->chan_lock);
}
/*
* Get the reference to server struct to be sure that the last call of
* cifs_put_tcon() in the loop below won't release the server pointer.
*/
- if (tcon_exist)
+ if (tcon_exist || ses_exist)
server->srv_count++;
spin_unlock(&cifs_tcp_ses_lock);
@@ -3754,13 +3846,43 @@ void smb2_reconnect_server(struct work_struct *work)
cifs_put_tcon(tcon);
}
- cifs_dbg(FYI, "Reconnecting tcons finished\n");
+ if (!ses_exist)
+ goto done;
+
+ /* allocate a dummy tcon struct used for reconnect */
+ tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL);
+ if (!tcon) {
+ resched = true;
+ list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
+ list_del_init(&ses->rlist);
+ cifs_put_smb_ses(ses);
+ }
+ goto done;
+ }
+
+ tcon->status = TID_GOOD;
+ tcon->retry = false;
+ tcon->need_reconnect = false;
+
+ /* now reconnect sessions for necessary channels */
+ list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
+ tcon->ses = ses;
+ rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server);
+ if (rc)
+ resched = true;
+ list_del_init(&ses->rlist);
+ cifs_put_smb_ses(ses);
+ }
+ kfree(tcon);
+
+done:
+ cifs_dbg(FYI, "Reconnecting tcons and channels finished\n");
if (resched)
queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ);
- mutex_unlock(&server->reconnect_mutex);
+ mutex_unlock(&pserver->reconnect_mutex);
/* now we can safely release srv struct */
- if (tcon_exist)
+ if (tcon_exist || ses_exist)
cifs_put_tcp_session(server, 1);
}
@@ -3774,20 +3896,23 @@ SMB2_echo(struct TCP_Server_Info *server)
.rq_nvec = 1 };
unsigned int total_len;
- cifs_dbg(FYI, "In echo request\n");
+ cifs_dbg(FYI, "In echo request for conn_id %lld\n", server->conn_id);
+ spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus == CifsNeedNegotiate) {
+ spin_unlock(&cifs_tcp_ses_lock);
/* No need to send echo on newly established connections */
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
return rc;
}
+ spin_unlock(&cifs_tcp_ses_lock);
rc = smb2_plain_req_init(SMB2_ECHO, NULL, server,
(void **)&req, &total_len);
if (rc)
return rc;
- req->sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->hdr.CreditRequest = cpu_to_le16(1);
iov[0].iov_len = total_len;
iov[0].iov_base = (char *)req;
@@ -3890,8 +4015,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
unsigned int remaining_bytes, int request_type)
{
int rc = -EACCES;
- struct smb2_read_plain_req *req = NULL;
- struct smb2_sync_hdr *shdr;
+ struct smb2_read_req *req = NULL;
+ struct smb2_hdr *shdr;
struct TCP_Server_Info *server = io_parms->server;
rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, server,
@@ -3902,8 +4027,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
if (server == NULL)
return -ECONNABORTED;
- shdr = &req->sync_hdr;
- shdr->ProcessId = cpu_to_le32(io_parms->pid);
+ shdr = &req->hdr;
+ shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
req->PersistentFileId = io_parms->persistent_fid;
req->VolatileFileId = io_parms->volatile_fid;
@@ -3940,7 +4065,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
if (need_invalidate)
req->Channel = SMB2_CHANNEL_RDMA_V1;
req->ReadChannelInfoOffset =
- cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer));
+ cpu_to_le16(offsetof(struct smb2_read_req, Buffer));
req->ReadChannelInfoLength =
cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
@@ -3964,10 +4089,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
* Related requests use info from previous read request
* in chain.
*/
- shdr->SessionId = 0xFFFFFFFFFFFFFFFF;
- shdr->TreeId = 0xFFFFFFFF;
- req->PersistentFileId = 0xFFFFFFFFFFFFFFFF;
- req->VolatileFileId = 0xFFFFFFFFFFFFFFFF;
+ shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
+ shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF);
+ req->PersistentFileId = (u64)-1;
+ req->VolatileFileId = (u64)-1;
}
}
if (remaining_bytes > io_parms->length)
@@ -3985,8 +4110,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct cifs_readdata *rdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
struct TCP_Server_Info *server = rdata->server;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rdata->iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rdata->iov[0].iov_base;
struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
.rq_nvec = 1,
@@ -4072,7 +4197,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
{
int rc, flags = 0;
char *buf;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct cifs_io_parms io_parms;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 1 };
@@ -4105,7 +4230,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
rdata->iov[0].iov_base = buf;
rdata->iov[0].iov_len = total_len;
- shdr = (struct smb2_sync_hdr *)buf;
+ shdr = (struct smb2_hdr *)buf;
if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
@@ -4144,7 +4269,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
{
struct smb_rqst rqst;
int resp_buftype, rc;
- struct smb2_read_plain_req *req = NULL;
+ struct smb2_read_req *req = NULL;
struct smb2_read_rsp *rsp = NULL;
struct kvec iov[1];
struct kvec rsp_iov;
@@ -4178,19 +4303,20 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc != -ENODATA) {
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
- trace_smb3_read_err(xid, req->PersistentFileId,
+ trace_smb3_read_err(xid,
+ req->PersistentFileId,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length,
rc);
} else
- trace_smb3_read_done(xid, req->PersistentFileId,
- io_parms->tcon->tid, ses->Suid,
- io_parms->offset, 0);
+ trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid,
+ ses->Suid, io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
cifs_small_buf_release(req);
return rc == -ENODATA ? 0 : rc;
} else
- trace_smb3_read_done(xid, req->PersistentFileId,
+ trace_smb3_read_done(xid,
+ req->PersistentFileId,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length);
@@ -4238,7 +4364,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
wdata->result = smb2_check_receive(mid, server, 0);
if (wdata->result != 0)
@@ -4264,7 +4390,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
- credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
fallthrough;
default:
@@ -4311,7 +4437,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
{
int rc = -EACCES, flags = 0;
struct smb2_write_req *req = NULL;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
struct TCP_Server_Info *server = wdata->server;
struct kvec iov[1];
@@ -4329,8 +4455,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- shdr = (struct smb2_sync_hdr *)req;
- shdr->ProcessId = cpu_to_le32(wdata->cfile->pid);
+ shdr = (struct smb2_hdr *)req;
+ shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid);
req->PersistentFileId = wdata->cfile->fid.persistent_fid;
req->VolatileFileId = wdata->cfile->fid.volatile_fid;
@@ -4430,7 +4556,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
wdata, flags, &wdata->credits);
if (rc) {
- trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
+ trace_smb3_write_err(0 /* no xid */,
+ req->PersistentFileId,
tcon->tid, tcon->ses->Suid, wdata->offset,
wdata->bytes, rc);
kref_put(&wdata->refcount, release);
@@ -4481,7 +4608,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
if (smb3_encryption_required(io_parms->tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
+ req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
req->PersistentFileId = io_parms->persistent_fid;
req->VolatileFileId = io_parms->volatile_fid;
@@ -4512,7 +4639,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
if (rc) {
- trace_smb3_write_err(xid, req->PersistentFileId,
+ trace_smb3_write_err(xid,
+ req->PersistentFileId,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
io_parms->offset, io_parms->length, rc);
@@ -4520,10 +4648,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_dbg(VFS, "Send error in write = %d\n", rc);
} else {
*nbytes = le32_to_cpu(rsp->DataLength);
- trace_smb3_write_done(xid, req->PersistentFileId,
- io_parms->tcon->tid,
- io_parms->tcon->ses->Suid,
- io_parms->offset, *nbytes);
+ trace_smb3_write_done(xid,
+ req->PersistentFileId,
+ io_parms->tcon->tid,
+ io_parms->tcon->ses->Suid,
+ io_parms->offset, *nbytes);
}
cifs_small_buf_release(req);
@@ -4866,7 +4995,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
if (rc == -ENODATA &&
- rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ rsp->hdr.Status == STATUS_NO_MORE_FILES) {
trace_smb3_query_dir_done(xid, persistent_fid,
tcon->tid, tcon->ses->Suid, index, 0);
srch_inf->endOfSearch = true;
@@ -4914,7 +5043,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (rc)
return rc;
- req->sync_hdr.ProcessId = cpu_to_le32(pid);
+ req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid);
req->InfoType = info_type;
req->FileInfoClass = info_class;
req->PersistentFileId = persistent_fid;
@@ -5074,7 +5203,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
req->VolatileFid = volatile_fid;
req->PersistentFid = persistent_fid;
req->OplockLevel = oplock_level;
- req->sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->hdr.CreditRequest = cpu_to_le16(1);
flags |= CIFS_NO_RSP_BUF;
@@ -5376,7 +5505,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->sync_hdr.ProcessId = cpu_to_le32(pid);
+ req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid);
req->LockCount = cpu_to_le16(num_lock);
req->PersistentFileId = persist_fid;
@@ -5452,7 +5581,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->hdr.CreditRequest = cpu_to_le16(1);
req->StructureSize = cpu_to_le16(36);
total_len += 12;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f32c99c9ba13..d8c4388b190d 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -14,156 +14,12 @@
#include <net/sock.h>
#include "cifsacl.h"
-/*
- * Note that, due to trying to use names similar to the protocol specifications,
- * there are many mixed case field names in the structures below. Although
- * this does not match typical Linux kernel style, it is necessary to be
- * able to match against the protocol specfication.
- *
- * SMB2 commands
- * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
- * (ie no useful data other than the SMB error code itself) and are marked such.
- * Knowing this helps avoid response buffer allocations and copy in some cases.
- */
-
-/* List of commands in host endian */
-#define SMB2_NEGOTIATE_HE 0x0000
-#define SMB2_SESSION_SETUP_HE 0x0001
-#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */
-#define SMB2_TREE_CONNECT_HE 0x0003
-#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */
-#define SMB2_CREATE_HE 0x0005
-#define SMB2_CLOSE_HE 0x0006
-#define SMB2_FLUSH_HE 0x0007 /* trivial resp */
-#define SMB2_READ_HE 0x0008
-#define SMB2_WRITE_HE 0x0009
-#define SMB2_LOCK_HE 0x000A
-#define SMB2_IOCTL_HE 0x000B
-#define SMB2_CANCEL_HE 0x000C
-#define SMB2_ECHO_HE 0x000D
-#define SMB2_QUERY_DIRECTORY_HE 0x000E
-#define SMB2_CHANGE_NOTIFY_HE 0x000F
-#define SMB2_QUERY_INFO_HE 0x0010
-#define SMB2_SET_INFO_HE 0x0011
-#define SMB2_OPLOCK_BREAK_HE 0x0012
-
-/* The same list in little endian */
-#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
-#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE)
-#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE)
-#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE)
-#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
-#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE)
-#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE)
-#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE)
-#define SMB2_READ cpu_to_le16(SMB2_READ_HE)
-#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE)
-#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE)
-#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE)
-#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE)
-#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE)
-#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
-#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
-#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE)
-#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
-#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
-
-#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF)
-
-#define NUMBER_OF_SMB2_COMMANDS 0x0013
-
/* 52 transform hdr + 64 hdr + 88 create rsp */
#define SMB2_TRANSFORM_HEADER_SIZE 52
#define MAX_SMB2_HDR_SIZE 204
-#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
-#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
-#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
-
-/*
- * SMB2 Header Definition
- *
- * "MBZ" : Must be Zero
- * "BB" : BugBug, Something to check/review/analyze later
- * "PDU" : "Protocol Data Unit" (ie a network "frame")
- *
- */
-
-#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64)
-
-struct smb2_sync_hdr {
- __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
- __le16 StructureSize; /* 64 */
- __le16 CreditCharge; /* MBZ */
- __le32 Status; /* Error from server */
- __le16 Command;
- __le16 CreditRequest; /* CreditResponse */
- __le32 Flags;
- __le32 NextCommand;
- __le64 MessageId;
- __le32 ProcessId;
- __u32 TreeId; /* opaque - so do not make little endian */
- __u64 SessionId; /* opaque - so do not make little endian */
- __u8 Signature[16];
-} __packed;
-
/* The total header size for SMB2 read and write */
-#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr))
-
-struct smb2_sync_pdu {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize2; /* size of wct area (varies, request specific) */
-} __packed;
-
-#define SMB3_AES_CCM_NONCE 11
-#define SMB3_AES_GCM_NONCE 12
-
-/* Transform flags (for 3.0 dialect this flag indicates CCM */
-#define TRANSFORM_FLAG_ENCRYPTED 0x0001
-struct smb2_transform_hdr {
- __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
- __u8 Signature[16];
- __u8 Nonce[16];
- __le32 OriginalMessageSize;
- __u16 Reserved1;
- __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
- __u64 SessionId;
-} __packed;
-
-/* See MS-SMB2 2.2.42 */
-struct smb2_compression_transform_hdr_unchained {
- __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
- __le32 OriginalCompressedSegmentSize;
- __le16 CompressionAlgorithm;
- __le16 Flags;
- __le16 Length; /* if chained it is length, else offset */
-} __packed;
-
-/* See MS-SMB2 2.2.42.1 */
-#define SMB2_COMPRESSION_FLAG_NONE 0x0000
-#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001
-
-struct compression_payload_header {
- __le16 CompressionAlgorithm;
- __le16 Flags;
- __le32 Length; /* length of compressed playload including field below if present */
- /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2 */
-struct smb2_compression_transform_hdr_chained {
- __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
- __le32 OriginalCompressedSegmentSize;
- /* struct compression_payload_header[] */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2.2 */
-struct compression_pattern_payload_v1 {
- __le16 Pattern;
- __le16 Reserved1;
- __le16 Reserved2;
- __le32 Repetitions;
-} __packed;
+#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_hdr))
/* See MS-SMB2 2.2.43 */
struct smb2_rdma_transform {
@@ -190,17 +46,6 @@ struct smb2_rdma_crypto_transform {
} __packed;
/*
- * SMB2 flag definitions
- */
-#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
-#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
-#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
-#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
-#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */
-#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
-#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */
-
-/*
* Definitions for SMB2 Protocol Data Units (network frames)
*
* See MS-SMB2.PDF specification for protocol details.
@@ -211,16 +56,6 @@ struct smb2_rdma_crypto_transform {
#define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL
-#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
-
-struct smb2_err_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize;
- __le16 Reserved; /* MBZ */
- __le32 ByteCount; /* even if zero, at least one byte follows */
- __u8 ErrorData[1]; /* variable length */
-} __packed;
-
#define SYMLINK_ERROR_TAG 0x4c4d5953
struct smb2_symlink_err_rsp {
@@ -270,530 +105,6 @@ struct share_redirect_error_context_rsp {
/* __u8 ResourceName[] */ /* Name of share as counted Unicode string */
} __packed;
-#define SMB2_CLIENT_GUID_SIZE 16
-
-struct smb2_negotiate_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 36 */
- __le16 DialectCount;
- __le16 SecurityMode;
- __le16 Reserved; /* MBZ */
- __le32 Capabilities;
- __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
- /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
- __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
- __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
- __le16 Reserved2;
- __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
-} __packed;
-
-/* Dialects */
-#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */
-#define SMB20_PROT_ID 0x0202
-#define SMB21_PROT_ID 0x0210
-#define SMB30_PROT_ID 0x0300
-#define SMB302_PROT_ID 0x0302
-#define SMB311_PROT_ID 0x0311
-#define BAD_PROT_ID 0xFFFF
-
-/* SecurityMode flags */
-#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001
-#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
-#define SMB2_SEC_MODE_FLAGS_ALL 0x0003
-
-/* Capabilities flags */
-#define SMB2_GLOBAL_CAP_DFS 0x00000001
-#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
-/* Internal types */
-#define SMB2_NT_FIND 0x00100000
-#define SMB2_LARGE_FILES 0x00200000
-
-
-/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */
-#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
-#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
-#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3)
-#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5)
-#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6)
-#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7)
-#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8)
-#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
-
-struct smb2_neg_context {
- __le16 ContextType;
- __le16 DataLength;
- __le32 Reserved;
- /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */
-} __packed;
-
-#define SMB311_LINUX_CLIENT_SALT_SIZE 32
-/* Hash Algorithm Types */
-#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
-#define SMB2_PREAUTH_HASH_SIZE 64
-
-/*
- * SaltLength that the server send can be zero, so the only three required
- * fields (all __le16) end up six bytes total, so the minimum context data len
- * in the response is six bytes which accounts for
- *
- * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm.
- */
-#define MIN_PREAUTH_CTXT_DATA_LEN 6
-
-struct smb2_preauth_neg_context {
- __le16 ContextType; /* 1 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 HashAlgorithmCount; /* 1 */
- __le16 SaltLength;
- __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
- __u8 Salt[SMB311_LINUX_CLIENT_SALT_SIZE];
-} __packed;
-
-/* Encryption Algorithms Ciphers */
-#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
-#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
-/* we currently do not request AES256_CCM since presumably GCM faster */
-#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003)
-#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004)
-
-/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
-#define MIN_ENCRYPT_CTXT_DATA_LEN 4
-struct smb2_encryption_neg_context {
- __le16 ContextType; /* 2 */
- __le16 DataLength;
- __le32 Reserved;
- /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */
- __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */
- __le16 Ciphers[3];
-} __packed;
-
-/* See MS-SMB2 2.2.3.1.3 */
-#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000)
-#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
-#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
-#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
-/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
-#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */
-
-/* Compression Flags */
-#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000)
-#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001)
-
-struct smb2_compression_capabilities_context {
- __le16 ContextType; /* 3 */
- __le16 DataLength;
- __u32 Reserved;
- __le16 CompressionAlgorithmCount;
- __u16 Padding;
- __u32 Flags;
- __le16 CompressionAlgorithms[3];
- __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */
- /* Check if pad needed */
-} __packed;
-
-/*
- * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
- * Its struct simply contains NetName, an array of Unicode characters
- */
-struct smb2_netname_neg_context {
- __le16 ContextType; /* 5 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 NetName[]; /* hostname of target converted to UCS-2 */
-} __packed;
-
-/*
- * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5
- * and 2.2.4.1.5
- */
-
-/* Flags */
-#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001
-
-struct smb2_transport_capabilities_context {
- __le16 ContextType; /* 6 */
- __le16 DataLength;
- __u32 Reserved;
- __le32 Flags;
- __u32 Pad;
-} __packed;
-
-/*
- * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
- * and 2.2.4.1.6
- */
-
-/* RDMA Transform IDs */
-#define SMB2_RDMA_TRANSFORM_NONE 0x0000
-#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
-#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002
-
-struct smb2_rdma_transform_capabilities_context {
- __le16 ContextType; /* 7 */
- __le16 DataLength;
- __u32 Reserved;
- __le16 TransformCount;
- __u16 Reserved1;
- __u32 Reserved2;
- __le16 RDMATransformIds[];
-} __packed;
-
-/*
- * For signing capabilities context see MS-SMB2 2.2.3.1.7
- * and 2.2.4.1.7
- */
-
-/* Signing algorithms */
-#define SIGNING_ALG_HMAC_SHA256 0
-#define SIGNING_ALG_AES_CMAC 1
-#define SIGNING_ALG_AES_GMAC 2
-
-struct smb2_signing_capabilities {
- __le16 ContextType; /* 8 */
- __le16 DataLength;
- __u32 Reserved;
- __le16 SigningAlgorithmCount;
- __le16 SigningAlgorithms[];
- /* Followed by padding to 8 byte boundary (required by some servers) */
-} __packed;
-
-#define POSIX_CTXT_DATA_LEN 16
-struct smb2_posix_neg_context {
- __le16 ContextType; /* 0x100 */
- __le16 DataLength;
- __le32 Reserved;
- __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
-} __packed;
-
-struct smb2_negotiate_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 65 */
- __le16 SecurityMode;
- __le16 DialectRevision;
- __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */
- __u8 ServerGUID[16];
- __le32 Capabilities;
- __le32 MaxTransactSize;
- __le32 MaxReadSize;
- __le32 MaxWriteSize;
- __le64 SystemTime; /* MBZ */
- __le64 ServerStartTime;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-/* Flags */
-#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
-#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
-
-struct smb2_sess_setup_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 25 */
- __u8 Flags;
- __u8 SecurityMode;
- __le32 Capabilities;
- __le32 Channel;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __u64 PreviousSessionId;
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-/* Currently defined SessionFlags */
-#define SMB2_SESSION_FLAG_IS_GUEST 0x0001
-#define SMB2_SESSION_FLAG_IS_NULL 0x0002
-#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
-struct smb2_sess_setup_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 SessionFlags;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-struct smb2_logoff_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_logoff_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-/* Flags/Reserved for SMB3.1.1 */
-#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001)
-#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002)
-#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004)
-
-struct smb2_tree_connect_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */
- __le16 PathOffset;
- __le16 PathLength;
- __u8 Buffer[1]; /* variable length */
-} __packed;
-
-/* See MS-SMB2 section 2.2.9.2 */
-/* Context Types */
-#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000
-#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001)
-
-struct tree_connect_contexts {
- __le16 ContextType;
- __le16 DataLength;
- __le32 Reserved;
- __u8 Data[];
-} __packed;
-
-/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
-struct smb3_blob_data {
- __le16 BlobSize;
- __u8 BlobData[];
-} __packed;
-
-/* Valid values for Attr */
-#define SE_GROUP_MANDATORY 0x00000001
-#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002
-#define SE_GROUP_ENABLED 0x00000004
-#define SE_GROUP_OWNER 0x00000008
-#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010
-#define SE_GROUP_INTEGRITY 0x00000020
-#define SE_GROUP_INTEGRITY_ENABLED 0x00000040
-#define SE_GROUP_RESOURCE 0x20000000
-#define SE_GROUP_LOGON_ID 0xC0000000
-
-/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
-
-struct sid_array_data {
- __le16 SidAttrCount;
- /* SidAttrList - array of sid_attr_data structs */
-} __packed;
-
-struct luid_attr_data {
-
-} __packed;
-
-/*
- * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
- * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA
- */
-
-struct privilege_array_data {
- __le16 PrivilegeCount;
- /* array of privilege_data structs */
-} __packed;
-
-struct remoted_identity_tcon_context {
- __le16 TicketType; /* must be 0x0001 */
- __le16 TicketSize; /* total size of this struct */
- __le16 User; /* offset to SID_ATTR_DATA struct with user info */
- __le16 UserName; /* offset to null terminated Unicode username string */
- __le16 Domain; /* offset to null terminated Unicode domain name */
- __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */
- __le16 RestrictedGroups; /* similar to above */
- __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */
- __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */
- __le16 Owner; /* offset to BLOB_DATA struct */
- __le16 DefaultDacl; /* offset to BLOB_DATA struct */
- __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
- __le16 UserClaims; /* offset to BLOB_DATA struct */
- __le16 DeviceClaims; /* offset to BLOB_DATA struct */
- __u8 TicketInfo[]; /* variable length buf - remoted identity data */
-} __packed;
-
-struct smb2_tree_connect_req_extension {
- __le32 TreeConnectContextOffset;
- __le16 TreeConnectContextCount;
- __u8 Reserved[10];
- __u8 PathName[]; /* variable sized array */
- /* followed by array of TreeConnectContexts */
-} __packed;
-
-struct smb2_tree_connect_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 16 */
- __u8 ShareType; /* see below */
- __u8 Reserved;
- __le32 ShareFlags; /* see below */
- __le32 Capabilities; /* see below */
- __le32 MaximalAccess;
-} __packed;
-
-/* Possible ShareType values */
-#define SMB2_SHARE_TYPE_DISK 0x01
-#define SMB2_SHARE_TYPE_PIPE 0x02
-#define SMB2_SHARE_TYPE_PRINT 0x03
-
-/*
- * Possible ShareFlags - exactly one and only one of the first 4 caching flags
- * must be set (any of the remaining, SHI1005, flags may be set individually
- * or in combination.
- */
-#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000
-#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010
-#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020
-#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
-#define SHI1005_FLAGS_DFS 0x00000001
-#define SHI1005_FLAGS_DFS_ROOT 0x00000002
-#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
-#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
-#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
-#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
-#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
-#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
-#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
-#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */
-#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */
-#define SHI1005_FLAGS_ALL 0x0014FF33
-
-/* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
-#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
-#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
-#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
-#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
-#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */
-
-struct smb2_tree_disconnect_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_tree_disconnect_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-/* File Attrubutes */
-#define FILE_ATTRIBUTE_READONLY 0x00000001
-#define FILE_ATTRIBUTE_HIDDEN 0x00000002
-#define FILE_ATTRIBUTE_SYSTEM 0x00000004
-#define FILE_ATTRIBUTE_DIRECTORY 0x00000010
-#define FILE_ATTRIBUTE_ARCHIVE 0x00000020
-#define FILE_ATTRIBUTE_NORMAL 0x00000080
-#define FILE_ATTRIBUTE_TEMPORARY 0x00000100
-#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200
-#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400
-#define FILE_ATTRIBUTE_COMPRESSED 0x00000800
-#define FILE_ATTRIBUTE_OFFLINE 0x00001000
-#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000
-#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000
-#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000
-#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000
-
-/* Oplock levels */
-#define SMB2_OPLOCK_LEVEL_NONE 0x00
-#define SMB2_OPLOCK_LEVEL_II 0x01
-#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
-#define SMB2_OPLOCK_LEVEL_BATCH 0x09
-#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
-/* Non-spec internal type */
-#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
-
-/* Desired Access Flags */
-#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
-#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002)
-#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004)
-#define FILE_READ_EA_LE cpu_to_le32(0x00000008)
-#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010)
-#define FILE_EXECUTE_LE cpu_to_le32(0x00000020)
-#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080)
-#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100)
-#define FILE_DELETE_LE cpu_to_le32(0x00010000)
-#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000)
-#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000)
-#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000)
-#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000)
-#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
-#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000)
-#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000)
-#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000)
-#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000)
-#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000)
-
-/* ShareAccess Flags */
-#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001)
-#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002)
-#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004)
-#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007)
-
-/* CreateDisposition Flags */
-#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000)
-#define FILE_OPEN_LE cpu_to_le32(0x00000001)
-#define FILE_CREATE_LE cpu_to_le32(0x00000002)
-#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003)
-#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004)
-#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005)
-
-/* CreateOptions Flags */
-#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001)
-/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */
-#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
-#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
-#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008)
-#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010)
-#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE cpu_to_le32(0x00000020)
-#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
-#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
-#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
-#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
-#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
-#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
-#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
-#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
-#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000)
-#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
-#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
-#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000)
-
-#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
- | FILE_READ_ATTRIBUTES_LE)
-#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
- | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
-#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
-
-/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
-#define IL_ANONYMOUS cpu_to_le32(0x00000000)
-#define IL_IDENTIFICATION cpu_to_le32(0x00000001)
-#define IL_IMPERSONATION cpu_to_le32(0x00000002)
-#define IL_DELEGATE cpu_to_le32(0x00000003)
-
-/* Create Context Values */
-#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */
-#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
-#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
-#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
-#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
-#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
-#define SMB2_CREATE_REQUEST_LEASE "RqLs"
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
-#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74
-#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010
-#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83
-#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C
-
-/* Flag (SMB3 open response) values */
-#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
-
/*
* Maximum number of iovs we need for an open/create request.
* [0] : struct smb2_create_req
@@ -807,26 +118,6 @@ struct smb2_tree_disconnect_rsp {
*/
#define SMB2_CREATE_IOV_SIZE 8
-struct smb2_create_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 57 */
- __u8 SecurityFlags;
- __u8 RequestedOplockLevel;
- __le32 ImpersonationLevel;
- __le64 SmbCreateFlags;
- __le64 Reserved;
- __le32 DesiredAccess;
- __le32 FileAttributes;
- __le32 ShareAccess;
- __le32 CreateDisposition;
- __le32 CreateOptions;
- __le16 NameOffset;
- __le16 NameLength;
- __le32 CreateContextsOffset;
- __le32 CreateContextsLength;
- __u8 Buffer[];
-} __packed;
-
/*
* Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
* 88 (fixed part of create response) + 520 (path) + 208 (contexts) +
@@ -834,82 +125,10 @@ struct smb2_create_req {
*/
#define MAX_SMB2_CREATE_RESPONSE_SIZE 880
-struct smb2_create_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 89 */
- __u8 OplockLevel;
- __u8 Flag; /* 0x01 if reparse point */
- __le32 CreateAction;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize;
- __le64 EndofFile;
- __le32 FileAttributes;
- __le32 Reserved2;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 CreateContextsOffset;
- __le32 CreateContextsLength;
- __u8 Buffer[1];
-} __packed;
-
-struct create_context {
- __le32 Next;
- __le16 NameOffset;
- __le16 NameLength;
- __le16 Reserved;
- __le16 DataOffset;
- __le32 DataLength;
- __u8 Buffer[];
-} __packed;
-
#define SMB2_LEASE_READ_CACHING_HE 0x01
#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
#define SMB2_LEASE_WRITE_CACHING_HE 0x04
-#define SMB2_LEASE_NONE cpu_to_le32(0x00)
-#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01)
-#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02)
-#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04)
-
-#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x00000002)
-#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET cpu_to_le32(0x00000004)
-
-#define SMB2_LEASE_KEY_SIZE 16
-
-struct lease_context {
- u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
- __le32 LeaseState;
- __le32 LeaseFlags;
- __le64 LeaseDuration;
-} __packed;
-
-struct lease_context_v2 {
- u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
- __le32 LeaseState;
- __le32 LeaseFlags;
- __le64 LeaseDuration;
- __le64 ParentLeaseKeyLow;
- __le64 ParentLeaseKeyHigh;
- __le16 Epoch;
- __le16 Reserved;
-} __packed;
-
-struct create_lease {
- struct create_context ccontext;
- __u8 Name[8];
- struct lease_context lcontext;
-} __packed;
-
-struct create_lease_v2 {
- struct create_context ccontext;
- __u8 Name[8];
- struct lease_context_v2 lcontext;
- __u8 Pad[4];
-} __packed;
-
struct create_durable {
struct create_context ccontext;
__u8 Name[8];
@@ -922,13 +141,6 @@ struct create_durable {
} Data;
} __packed;
-struct create_posix {
- struct create_context ccontext;
- __u8 Name[16];
- __le32 Mode;
- __u32 Reserved;
-} __packed;
-
/* See MS-SMB2 2.2.13.2.11 */
/* Flags */
#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
@@ -1017,12 +229,6 @@ struct copychunk_ioctl {
__u32 Reserved2;
} __packed;
-/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */
-struct file_zero_data_information {
- __le64 FileOffset;
- __le64 BeyondFinalZero;
-} __packed;
-
struct copychunk_ioctl_rsp {
__le32 ChunksWritten;
__le32 ChunkBytesWritten;
@@ -1068,11 +274,6 @@ struct fsctl_get_integrity_information_rsp {
__le32 ClusterSizeInBytes;
} __packed;
-struct file_allocated_range_buffer {
- __le64 file_offset;
- __le64 length;
-} __packed;
-
/* Integrity ChecksumAlgorithm choices for above */
#define CHECKSUM_TYPE_NONE 0x0000
#define CHECKSUM_TYPE_CRC64 0x0002
@@ -1081,53 +282,6 @@ struct file_allocated_range_buffer {
/* Integrity flags for above */
#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
-/* Reparse structures - see MS-FSCC 2.1.2 */
-
-/* struct fsctl_reparse_info_req is empty, only response structs (see below) */
-
-struct reparse_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __u8 DataBuffer[]; /* Variable Length */
-} __packed;
-
-struct reparse_guid_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __u8 ReparseGuid[16];
- __u8 DataBuffer[]; /* Variable Length */
-} __packed;
-
-struct reparse_mount_point_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __le16 SubstituteNameOffset;
- __le16 SubstituteNameLength;
- __le16 PrintNameOffset;
- __le16 PrintNameLength;
- __u8 PathBuffer[]; /* Variable Length */
-} __packed;
-
-#define SYMLINK_FLAG_RELATIVE 0x00000001
-
-struct reparse_symlink_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __le16 SubstituteNameOffset;
- __le16 SubstituteNameLength;
- __le16 PrintNameOffset;
- __le16 PrintNameLength;
- __le32 Flags;
- __u8 PathBuffer[]; /* Variable Length */
-} __packed;
-
-/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
-
-
/* See MS-DFSC 2.2.2 */
struct fsctl_get_dfs_referral_req {
__le16 MaxReferralLevel;
@@ -1143,22 +297,6 @@ struct network_resiliency_req {
} __packed;
/* There is no buffer for the response ie no struct network_resiliency_rsp */
-
-struct validate_negotiate_info_req {
- __le32 Capabilities;
- __u8 Guid[SMB2_CLIENT_GUID_SIZE];
- __le16 SecurityMode;
- __le16 DialectCount;
- __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
-} __packed;
-
-struct validate_negotiate_info_rsp {
- __le32 Capabilities;
- __u8 Guid[SMB2_CLIENT_GUID_SIZE];
- __le16 SecurityMode;
- __le16 Dialect; /* Dialect in use for the connection */
-} __packed;
-
#define RSS_CAPABLE cpu_to_le32(0x00000001)
#define RDMA_CAPABLE cpu_to_le32(0x00000002)
@@ -1194,14 +332,6 @@ struct compress_ioctl {
__le16 CompressionState; /* See cifspdu.h for possible flag values */
} __packed;
-struct duplicate_extents_to_file {
- __u64 PersistentFileHandle; /* source file handle, opaque endianness */
- __u64 VolatileFileHandle;
- __le64 SourceFileOffset;
- __le64 TargetFileOffset;
- __le64 ByteCount; /* Bytes to be copied */
-} __packed;
-
/*
* Maximum number of iovs we need for an ioctl request.
* [0] : struct smb2_ioctl_req
@@ -1209,525 +339,11 @@ struct duplicate_extents_to_file {
*/
#define SMB2_IOCTL_IOV_SIZE 2
-struct smb2_ioctl_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 57 */
- __u16 Reserved;
- __le32 CtlCode;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 InputOffset;
- __le32 InputCount;
- __le32 MaxInputResponse;
- __le32 OutputOffset;
- __le32 OutputCount;
- __le32 MaxOutputResponse;
- __le32 Flags;
- __u32 Reserved2;
- __u8 Buffer[];
-} __packed;
-
-struct smb2_ioctl_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 57 */
- __u16 Reserved;
- __le32 CtlCode;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 InputOffset;
- __le32 InputCount;
- __le32 OutputOffset;
- __le32 OutputCount;
- __le32 Flags;
- __u32 Reserved2;
- /* char * buffer[] */
-} __packed;
-
-/* Currently defined values for close flags */
-#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
-struct smb2_close_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 24 */
- __le16 Flags;
- __le32 Reserved;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
-} __packed;
-
-/*
- * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
- */
-#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
-
-struct smb2_close_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* 60 */
- __le16 Flags;
- __le32 Reserved;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile;
- __le32 Attributes;
-} __packed;
-
-struct smb2_flush_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 24 */
- __le16 Reserved1;
- __le32 Reserved2;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
-} __packed;
-
-struct smb2_flush_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize;
- __le16 Reserved;
-} __packed;
-
-/* For read request Flags field below, following flag is defined for SMB3.02 */
-#define SMB2_READFLAG_READ_UNBUFFERED 0x01
-#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */
-
-/* Channel field for read and write: exactly one of following flags can be set*/
-#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
-#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */
-#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */
-#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */
-
-/* SMB2 read request without RFC1001 length at the beginning */
-struct smb2_read_plain_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 49 */
- __u8 Padding; /* offset from start of SMB2 header to place read */
- __u8 Flags; /* MBZ unless SMB3.02 or later */
- __le32 Length;
- __le64 Offset;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 MinimumCount;
- __le32 Channel; /* MBZ except for SMB3 or later */
- __le32 RemainingBytes;
- __le16 ReadChannelInfoOffset;
- __le16 ReadChannelInfoLength;
- __u8 Buffer[1];
-} __packed;
-
-/* Read flags */
-#define SMB2_READFLAG_RESPONSE_NONE 0x00000000
-#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001
-
-struct smb2_read_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 17 */
- __u8 DataOffset;
- __u8 Reserved;
- __le32 DataLength;
- __le32 DataRemaining;
- __u32 Flags;
- __u8 Buffer[1];
-} __packed;
-
-/* For write request Flags field below the following flags are defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
-#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
-
-struct smb2_write_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 49 */
- __le16 DataOffset; /* offset from start of SMB2 header to write data */
- __le32 Length;
- __le64 Offset;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 Channel; /* MBZ unless SMB3.02 or later */
- __le32 RemainingBytes;
- __le16 WriteChannelInfoOffset;
- __le16 WriteChannelInfoLength;
- __le32 Flags;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_write_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 17 */
- __u8 DataOffset;
- __u8 Reserved;
- __le32 DataLength;
- __le32 DataRemaining;
- __u32 Reserved2;
- __u8 Buffer[1];
-} __packed;
-
-/* notify flags */
-#define SMB2_WATCH_TREE 0x0001
-
-/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */
-#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001
-#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002
-#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004
-#define FILE_NOTIFY_CHANGE_SIZE 0x00000008
-#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010
-#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020
-#define FILE_NOTIFY_CHANGE_CREATION 0x00000040
-#define FILE_NOTIFY_CHANGE_EA 0x00000080
-#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100
-#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200
-#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400
-#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800
-
-struct smb2_change_notify_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize;
- __le16 Flags;
- __le32 OutputBufferLength;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 CompletionFilter;
- __u32 Reserved;
-} __packed;
-
-struct smb2_change_notify_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1]; /* array of file notify structs */
-} __packed;
-
-#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001
-#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002
-#define SMB2_LOCKFLAG_UNLOCK 0x0004
-#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010
-
-struct smb2_lock_element {
- __le64 Offset;
- __le64 Length;
- __le32 Flags;
- __le32 Reserved;
-} __packed;
-
-struct smb2_lock_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 48 */
- __le16 LockCount;
- /*
- * The least significant four bits are the index, the other 28 bits are
- * the lock sequence number (0 to 64). See MS-SMB2 2.2.26
- */
- __le32 LockSequenceNumber;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- /* Followed by at least one */
- struct smb2_lock_element locks[1];
-} __packed;
-
-struct smb2_lock_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_echo_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __u16 Reserved;
-} __packed;
-
-struct smb2_echo_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __u16 Reserved;
-} __packed;
-
-/* search (query_directory) Flags field */
-#define SMB2_RESTART_SCANS 0x01
-#define SMB2_RETURN_SINGLE_ENTRY 0x02
-#define SMB2_INDEX_SPECIFIED 0x04
-#define SMB2_REOPEN 0x10
-
-#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2
-
-/*
- * Valid FileInformation classes.
- *
- * Note that these are a subset of the (file) QUERY_INFO levels defined
- * later in this file (but since QUERY_DIRECTORY uses equivalent numbers
- * we do not redefine them here)
- *
- * FileDirectoryInfomation 0x01
- * FileFullDirectoryInformation 0x02
- * FileIdFullDirectoryInformation 0x26
- * FileBothDirectoryInformation 0x03
- * FileIdBothDirectoryInformation 0x25
- * FileNamesInformation 0x0C
- * FileIdExtdDirectoryInformation 0x3C
- */
-
-struct smb2_query_directory_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 33 */
- __u8 FileInformationClass;
- __u8 Flags;
- __le32 FileIndex;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le16 FileNameOffset;
- __le16 FileNameLength;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_query_directory_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-/* Possible InfoType values */
-#define SMB2_O_INFO_FILE 0x01
-#define SMB2_O_INFO_FILESYSTEM 0x02
-#define SMB2_O_INFO_SECURITY 0x03
-#define SMB2_O_INFO_QUOTA 0x04
-
-/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */
-#define OWNER_SECINFO 0x00000001
-#define GROUP_SECINFO 0x00000002
-#define DACL_SECINFO 0x00000004
-#define SACL_SECINFO 0x00000008
-#define LABEL_SECINFO 0x00000010
-#define ATTRIBUTE_SECINFO 0x00000020
-#define SCOPE_SECINFO 0x00000040
-#define BACKUP_SECINFO 0x00010000
-#define UNPROTECTED_SACL_SECINFO 0x10000000
-#define UNPROTECTED_DACL_SECINFO 0x20000000
-#define PROTECTED_SACL_SECINFO 0x40000000
-#define PROTECTED_DACL_SECINFO 0x80000000
-
-/* Flags used for FileFullEAinfo */
-#define SL_RESTART_SCAN 0x00000001
-#define SL_RETURN_SINGLE_ENTRY 0x00000002
-#define SL_INDEX_SPECIFIED 0x00000004
-
-struct smb2_query_info_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 41 */
- __u8 InfoType;
- __u8 FileInfoClass;
- __le32 OutputBufferLength;
- __le16 InputBufferOffset;
- __u16 Reserved;
- __le32 InputBufferLength;
- __le32 AdditionalInformation;
- __le32 Flags;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_query_info_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
/*
- * Maximum number of iovs we need for a set-info request.
- * The largest one is rename/hardlink
- * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info
- * [1] : path
- * [2] : compound padding
- */
-#define SMB2_SET_INFO_IOV_SIZE 3
-
-struct smb2_set_info_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 33 */
- __u8 InfoType;
- __u8 FileInfoClass;
- __le32 BufferLength;
- __le16 BufferOffset;
- __u16 Reserved;
- __le32 AdditionalInformation;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_set_info_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 2 */
-} __packed;
-
-struct smb2_oplock_break {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 24 */
- __u8 OplockLevel;
- __u8 Reserved;
- __le32 Reserved2;
- __u64 PersistentFid;
- __u64 VolatileFid;
-} __packed;
-
-#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
-
-struct smb2_lease_break {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 44 */
- __le16 Epoch;
- __le32 Flags;
- __u8 LeaseKey[16];
- __le32 CurrentLeaseState;
- __le32 NewLeaseState;
- __le32 BreakReason;
- __le32 AccessMaskHint;
- __le32 ShareMaskHint;
-} __packed;
-
-struct smb2_lease_ack {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 36 */
- __le16 Reserved;
- __le32 Flags;
- __u8 LeaseKey[16];
- __le32 LeaseState;
- __le64 LeaseDuration;
-} __packed;
-
-/*
- * PDU infolevel structure definitions
+ * PDU query infolevel structure definitions
* BB consider moving to a different header
*/
-/* File System Information Classes */
-#define FS_VOLUME_INFORMATION 1 /* Query */
-#define FS_LABEL_INFORMATION 2 /* Local only */
-#define FS_SIZE_INFORMATION 3 /* Query */
-#define FS_DEVICE_INFORMATION 4 /* Query */
-#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
-#define FS_CONTROL_INFORMATION 6 /* Query, Set */
-#define FS_FULL_SIZE_INFORMATION 7 /* Query */
-#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
-#define FS_DRIVER_PATH_INFORMATION 9 /* Local only */
-#define FS_VOLUME_FLAGS_INFORMATION 10 /* Local only */
-#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */
-#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */
-
-struct smb2_fs_full_size_info {
- __le64 TotalAllocationUnits;
- __le64 CallerAvailableAllocationUnits;
- __le64 ActualAvailableAllocationUnits;
- __le32 SectorsPerAllocationUnit;
- __le32 BytesPerSector;
-} __packed;
-
-#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001
-#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
-#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004
-#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008
-
-/* sector size info struct */
-struct smb3_fs_ss_info {
- __le32 LogicalBytesPerSector;
- __le32 PhysicalBytesPerSectorForAtomicity;
- __le32 PhysicalBytesPerSectorForPerf;
- __le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity;
- __le32 Flags;
- __le32 ByteOffsetForSectorAlignment;
- __le32 ByteOffsetForPartitionAlignment;
-} __packed;
-
-/* volume info struct - see MS-FSCC 2.5.9 */
-#define MAX_VOL_LABEL_LEN 32
-struct smb3_fs_vol_info {
- __le64 VolumeCreationTime;
- __u32 VolumeSerialNumber;
- __le32 VolumeLabelLength; /* includes trailing null */
- __u8 SupportsObjects; /* True if eg like NTFS, supports objects */
- __u8 Reserved;
- __u8 VolumeLabel[]; /* variable len */
-} __packed;
-
-/* partial list of QUERY INFO levels */
-#define FILE_DIRECTORY_INFORMATION 1
-#define FILE_FULL_DIRECTORY_INFORMATION 2
-#define FILE_BOTH_DIRECTORY_INFORMATION 3
-#define FILE_BASIC_INFORMATION 4
-#define FILE_STANDARD_INFORMATION 5
-#define FILE_INTERNAL_INFORMATION 6
-#define FILE_EA_INFORMATION 7
-#define FILE_ACCESS_INFORMATION 8
-#define FILE_NAME_INFORMATION 9
-#define FILE_RENAME_INFORMATION 10
-#define FILE_LINK_INFORMATION 11
-#define FILE_NAMES_INFORMATION 12
-#define FILE_DISPOSITION_INFORMATION 13
-#define FILE_POSITION_INFORMATION 14
-#define FILE_FULL_EA_INFORMATION 15
-#define FILE_MODE_INFORMATION 16
-#define FILE_ALIGNMENT_INFORMATION 17
-#define FILE_ALL_INFORMATION 18
-#define FILE_ALLOCATION_INFORMATION 19
-#define FILE_END_OF_FILE_INFORMATION 20
-#define FILE_ALTERNATE_NAME_INFORMATION 21
-#define FILE_STREAM_INFORMATION 22
-#define FILE_PIPE_INFORMATION 23
-#define FILE_PIPE_LOCAL_INFORMATION 24
-#define FILE_PIPE_REMOTE_INFORMATION 25
-#define FILE_MAILSLOT_QUERY_INFORMATION 26
-#define FILE_MAILSLOT_SET_INFORMATION 27
-#define FILE_COMPRESSION_INFORMATION 28
-#define FILE_OBJECT_ID_INFORMATION 29
-/* Number 30 not defined in documents */
-#define FILE_MOVE_CLUSTER_INFORMATION 31
-#define FILE_QUOTA_INFORMATION 32
-#define FILE_REPARSE_POINT_INFORMATION 33
-#define FILE_NETWORK_OPEN_INFORMATION 34
-#define FILE_ATTRIBUTE_TAG_INFORMATION 35
-#define FILE_TRACKING_INFORMATION 36
-#define FILEID_BOTH_DIRECTORY_INFORMATION 37
-#define FILEID_FULL_DIRECTORY_INFORMATION 38
-#define FILE_VALID_DATA_LENGTH_INFORMATION 39
-#define FILE_SHORT_NAME_INFORMATION 40
-#define FILE_SFIO_RESERVE_INFORMATION 44
-#define FILE_SFIO_VOLUME_INFORMATION 45
-#define FILE_HARD_LINK_INFORMATION 46
-#define FILE_NORMALIZED_NAME_INFORMATION 48
-#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
-#define FILE_STANDARD_LINK_INFORMATION 54
-#define FILE_ID_INFORMATION 59
-#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60
-
-struct smb2_file_internal_info {
- __le64 IndexNumber;
-} __packed; /* level 6 Query */
-
-struct smb2_file_rename_info { /* encoding of request for level 10 */
- __u8 ReplaceIfExists; /* 1 = replace existing target with new */
- /* 0 = fail if target already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
- char FileName[]; /* New name to be assigned */
- /* padding - overall struct size must be >= 24 so filename + pad >= 6 */
-} __packed; /* level 10 Set */
-
-struct smb2_file_link_info { /* encoding of request for level 11 */
- __u8 ReplaceIfExists; /* 1 = replace existing link with new */
- /* 0 = fail if link already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
- char FileName[]; /* Name to be assigned to new link */
-} __packed; /* level 11 Set */
-
struct smb2_file_full_ea_info { /* encoding of response for level 15 */
__le32 next_entry_offset;
__u8 flags;
@@ -1736,38 +352,6 @@ struct smb2_file_full_ea_info { /* encoding of response for level 15 */
char ea_data[]; /* \0 terminated name plus value */
} __packed; /* level 15 Set */
-/*
- * This level 18, although with struct with same name is different from cifs
- * level 0x107. Level 0x107 has an extra u64 between AccessFlags and
- * CurrentByteOffset.
- */
-struct smb2_file_all_info { /* data block encoding of response to level 18 */
- __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le32 Attributes;
- __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile; /* size ie offset to first free byte in file */
- __le32 NumberOfLinks; /* hard links */
- __u8 DeletePending;
- __u8 Directory;
- __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */
- __le64 IndexNumber;
- __le32 EASize;
- __le32 AccessFlags;
- __le64 CurrentByteOffset;
- __le32 Mode;
- __le32 AlignmentRequirement;
- __le32 FileNameLength;
- char FileName[1];
-} __packed; /* level 18 Query */
-
-struct smb2_file_eof_info { /* encoding of request for level 10 */
- __le64 EndOfFile; /* new end of file value */
-} __packed; /* level 20 Set */
-
struct smb2_file_reparse_point_info {
__le64 IndexNumber;
__le32 Tag;
@@ -1820,6 +404,8 @@ struct create_posix_rsp {
struct cifs_sid group; /* var-sized on the wire */
} __packed;
+#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2
+
/*
* SMB2-only POSIX info level for query dir
*
@@ -1851,31 +437,6 @@ struct smb2_posix_info {
*/
} __packed;
-/* Level 100 query info */
-struct smb311_posix_qinfo {
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 EndOfFile;
- __le64 AllocationSize;
- __le32 DosAttributes;
- __le64 Inode;
- __le32 DeviceId;
- __le32 Zero;
- /* beginning of POSIX Create Context Response */
- __le32 HardLinks;
- __le32 ReparseTag;
- __le32 Mode;
- u8 Sids[];
- /*
- * var sized owner SID
- * var sized group SID
- * le32 filenamelength
- * u8 filename[]
- */
-} __packed;
-
/*
* Parsed version of the above struct. Allows direct access to the
* variable length fields
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 547945443fa7..a69f1eed1cfe 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -25,7 +25,7 @@ extern int smb2_check_message(char *buf, unsigned int length,
struct TCP_Server_Info *server);
extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server);
extern char *smb2_get_data_area_len(int *off, int *len,
- struct smb2_sync_hdr *shdr);
+ struct smb2_hdr *shdr);
extern __le16 *cifs_convert_path_to_utf16(const char *from,
struct cifs_sb_info *cifs_sb);
@@ -123,8 +123,11 @@ extern void smb2_set_related(struct smb_rqst *rqst);
* SMB2 Worker functions - most of protocol specific implementation details
* are contained within these calls.
*/
-extern int SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses);
+extern int SMB2_negotiate(const unsigned int xid,
+ struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
extern int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
const struct nls_table *nls_cp);
extern int SMB2_logoff(const unsigned int xid, struct cifs_ses *ses);
extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
@@ -276,10 +279,11 @@ extern void smb2_copy_fs_info_to_kstatfs(
struct kstatfs *kst);
extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server);
extern int smb311_update_preauth_hash(struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
struct kvec *iov, int nvec);
extern int smb2_query_info_compound(const unsigned int xid,
struct cifs_tcon *tcon,
- __le16 *utf16_path, u32 desired_access,
+ const char *path, u32 desired_access,
u32 class, u32 type, u32 output_len,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index f59b956f9d25..2af79093b78b 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -19,7 +19,6 @@
#include <linux/mempool.h>
#include <linux/highmem.h>
#include <crypto/aead.h>
-#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "smb2proto.h"
@@ -101,13 +100,16 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
goto out;
found:
- if (ses->binding) {
+ spin_lock(&ses->chan_lock);
+ if (cifs_chan_needs_reconnect(ses, server) &&
+ !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) {
/*
* If we are in the process of binding a new channel
* to an existing session, use the master connection
* session key
*/
memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE);
+ spin_unlock(&ses->chan_lock);
goto out;
}
@@ -119,9 +121,11 @@ found:
chan = ses->chans + i;
if (chan->server == server) {
memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE);
+ spin_unlock(&ses->chan_lock);
goto out;
}
}
+ spin_unlock(&ses->chan_lock);
cifs_dbg(VFS,
"%s: Could not find channel signing key for session 0x%llx\n",
@@ -213,14 +217,14 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
unsigned char *sigptr = smb2_signature;
struct kvec *iov = rqst->rq_iov;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
struct shash_desc *shash;
struct crypto_shash *hash;
struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
- ses = smb2_find_smb_ses(server, shdr->SessionId);
+ ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId));
if (!ses) {
cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
return 0;
@@ -391,12 +395,18 @@ struct derivation_triplet {
static int
generate_smb3signingkey(struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
const struct derivation_triplet *ptriplet)
{
int rc;
-#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
- struct TCP_Server_Info *server = ses->server;
-#endif
+ bool is_binding = false;
+ int chan_index = 0;
+
+ spin_lock(&ses->chan_lock);
+ is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ /* TODO: introduce ref counting for channels when they can be freed */
+ spin_unlock(&ses->chan_lock);
/*
* All channels use the same encryption/decryption keys but
@@ -408,10 +418,10 @@ generate_smb3signingkey(struct cifs_ses *ses,
* master connection signing key stored in the session
*/
- if (ses->binding) {
+ if (is_binding) {
rc = generate_key(ses, ptriplet->signing.label,
ptriplet->signing.context,
- cifs_ses_binding_channel(ses)->signkey,
+ ses->chans[chan_index].signkey,
SMB3_SIGN_KEY_SIZE);
if (rc)
return rc;
@@ -423,8 +433,11 @@ generate_smb3signingkey(struct cifs_ses *ses,
if (rc)
return rc;
+ /* safe to access primary channel, since it will never go away */
+ spin_lock(&ses->chan_lock);
memcpy(ses->chans[0].signkey, ses->smb3signingkey,
SMB3_SIGN_KEY_SIZE);
+ spin_unlock(&ses->chan_lock);
rc = generate_key(ses, ptriplet->encryption.label,
ptriplet->encryption.context,
@@ -471,7 +484,8 @@ generate_smb3signingkey(struct cifs_ses *ses,
}
int
-generate_smb30signingkey(struct cifs_ses *ses)
+generate_smb30signingkey(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
{
struct derivation_triplet triplet;
@@ -495,11 +509,12 @@ generate_smb30signingkey(struct cifs_ses *ses)
d->context.iov_base = "ServerOut";
d->context.iov_len = 10;
- return generate_smb3signingkey(ses, &triplet);
+ return generate_smb3signingkey(ses, server, &triplet);
}
int
-generate_smb311signingkey(struct cifs_ses *ses)
+generate_smb311signingkey(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
{
struct derivation_triplet triplet;
@@ -523,7 +538,7 @@ generate_smb311signingkey(struct cifs_ses *ses)
d->context.iov_base = ses->preauth_sha_hash;
d->context.iov_len = 64;
- return generate_smb3signingkey(ses, &triplet);
+ return generate_smb3signingkey(ses, server, &triplet);
}
int
@@ -534,14 +549,14 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
struct shash_desc *shash;
struct crypto_shash *hash;
struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
- rc = smb2_get_sign_key(shdr->SessionId, server, key);
+ rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key);
if (rc)
return 0;
@@ -611,12 +626,12 @@ static int
smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
int rc = 0;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct smb2_sess_setup_req *ssr;
bool is_binding;
bool is_signed;
- shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ shdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
ssr = (struct smb2_sess_setup_req *)shdr;
is_binding = shdr->Command == SMB2_SESSION_SETUP &&
@@ -625,8 +640,12 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!is_signed)
return 0;
- if (server->tcpStatus == CifsNeedNegotiate)
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsNeedNegotiate) {
+ spin_unlock(&cifs_tcp_ses_lock);
return 0;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
if (!is_binding && !server->session_estab) {
strncpy(shdr->Signature, "BSRSPYL", 8);
return 0;
@@ -642,8 +661,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
unsigned int rc;
char server_response_sig[SMB2_SIGNATURE_SIZE];
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
if ((shdr->Command == SMB2_NEGOTIATE) ||
(shdr->Command == SMB2_SESSION_SETUP) ||
@@ -689,7 +708,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
*/
static inline void
smb2_seq_num_into_buf(struct TCP_Server_Info *server,
- struct smb2_sync_hdr *shdr)
+ struct smb2_hdr *shdr)
{
unsigned int i, num = le16_to_cpu(shdr->CreditCharge);
@@ -700,7 +719,7 @@ smb2_seq_num_into_buf(struct TCP_Server_Info *server,
}
static struct mid_q_entry *
-smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
+smb2_mid_entry_alloc(const struct smb2_hdr *shdr,
struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
@@ -732,39 +751,51 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
- trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId,
- le16_to_cpu(shdr->Command), temp->mid);
+ trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
+ le16_to_cpu(shdr->Command), temp->mid);
return temp;
}
static int
smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
- struct smb2_sync_hdr *shdr, struct mid_q_entry **mid)
+ struct smb2_hdr *shdr, struct mid_q_entry **mid)
{
- if (server->tcpStatus == CifsExiting)
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -ENOENT;
+ }
if (server->tcpStatus == CifsNeedReconnect) {
+ spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
return -EAGAIN;
}
if (server->tcpStatus == CifsNeedNegotiate &&
- shdr->Command != SMB2_NEGOTIATE)
+ shdr->Command != SMB2_NEGOTIATE) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -EAGAIN;
+ }
if (ses->status == CifsNew) {
if ((shdr->Command != SMB2_SESSION_SETUP) &&
- (shdr->Command != SMB2_NEGOTIATE))
+ (shdr->Command != SMB2_NEGOTIATE)) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -EAGAIN;
+ }
/* else ok - we are setting up session */
}
if (ses->status == CifsExiting) {
- if (shdr->Command != SMB2_LOGOFF)
+ if (shdr->Command != SMB2_LOGOFF) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -EAGAIN;
+ }
/* else ok - we are shutting down the session */
}
+ spin_unlock(&cifs_tcp_ses_lock);
*mid = smb2_mid_entry_alloc(shdr, server);
if (*mid == NULL)
@@ -807,8 +838,8 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server,
struct smb_rqst *rqst)
{
int rc;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
smb2_seq_num_into_buf(server, shdr);
@@ -833,13 +864,17 @@ struct mid_q_entry *
smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
{
int rc;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
+ spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus == CifsNeedNegotiate &&
- shdr->Command != SMB2_NEGOTIATE)
+ shdr->Command != SMB2_NEGOTIATE) {
+ spin_unlock(&cifs_tcp_ses_lock);
return ERR_PTR(-EAGAIN);
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
smb2_seq_num_into_buf(server, shdr);
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index dafcb6ab050d..bc279616c513 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -11,6 +11,8 @@
#define _CIFS_TRACE_H
#include <linux/tracepoint.h>
+#include <linux/net.h>
+#include <linux/inet.h>
/*
* Please use this 3-part article as a reference for writing new tracepoints:
@@ -854,6 +856,75 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+DECLARE_EVENT_CLASS(smb3_connect_class,
+ TP_PROTO(char *hostname,
+ __u64 conn_id,
+ const struct __kernel_sockaddr_storage *dst_addr),
+ TP_ARGS(hostname, conn_id, dst_addr),
+ TP_STRUCT__entry(
+ __string(hostname, hostname)
+ __field(__u64, conn_id)
+ __array(__u8, dst_addr, sizeof(struct sockaddr_storage))
+ ),
+ TP_fast_assign(
+ struct sockaddr_storage *pss = NULL;
+
+ __entry->conn_id = conn_id;
+ pss = (struct sockaddr_storage *)__entry->dst_addr;
+ *pss = *dst_addr;
+ __assign_str(hostname, hostname);
+ ),
+ TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc",
+ __entry->conn_id,
+ __get_str(hostname),
+ __entry->dst_addr)
+)
+
+#define DEFINE_SMB3_CONNECT_EVENT(name) \
+DEFINE_EVENT(smb3_connect_class, smb3_##name, \
+ TP_PROTO(char *hostname, \
+ __u64 conn_id, \
+ const struct __kernel_sockaddr_storage *addr), \
+ TP_ARGS(hostname, conn_id, addr))
+
+DEFINE_SMB3_CONNECT_EVENT(connect_done);
+
+DECLARE_EVENT_CLASS(smb3_connect_err_class,
+ TP_PROTO(char *hostname, __u64 conn_id,
+ const struct __kernel_sockaddr_storage *dst_addr, int rc),
+ TP_ARGS(hostname, conn_id, dst_addr, rc),
+ TP_STRUCT__entry(
+ __string(hostname, hostname)
+ __field(__u64, conn_id)
+ __array(__u8, dst_addr, sizeof(struct sockaddr_storage))
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ struct sockaddr_storage *pss = NULL;
+
+ __entry->conn_id = conn_id;
+ __entry->rc = rc;
+ pss = (struct sockaddr_storage *)__entry->dst_addr;
+ *pss = *dst_addr;
+ __assign_str(hostname, hostname);
+ ),
+ TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc",
+ __entry->rc,
+ __entry->conn_id,
+ __get_str(hostname),
+ __entry->dst_addr)
+)
+
+#define DEFINE_SMB3_CONNECT_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_connect_err_class, smb3_##name, \
+ TP_PROTO(char *hostname, \
+ __u64 conn_id, \
+ const struct __kernel_sockaddr_storage *addr, \
+ int rc), \
+ TP_ARGS(hostname, conn_id, addr, rc))
+
+DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err);
+
DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_PROTO(__u64 currmid,
__u64 conn_id,
@@ -935,6 +1006,13 @@ DEFINE_SMB3_CREDIT_EVENT(credit_timeout);
DEFINE_SMB3_CREDIT_EVENT(insufficient_credits);
DEFINE_SMB3_CREDIT_EVENT(too_many_credits);
DEFINE_SMB3_CREDIT_EVENT(add_credits);
+DEFINE_SMB3_CREDIT_EVENT(adj_credits);
+DEFINE_SMB3_CREDIT_EVENT(hdr_credits);
+DEFINE_SMB3_CREDIT_EVENT(nblk_credits);
+DEFINE_SMB3_CREDIT_EVENT(pend_credits);
+DEFINE_SMB3_CREDIT_EVENT(wait_credits);
+DEFINE_SMB3_CREDIT_EVENT(waitff_credits);
+DEFINE_SMB3_CREDIT_EVENT(overflow_credits);
DEFINE_SMB3_CREDIT_EVENT(set_credits);
#endif /* _CIFS_TRACE_H */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index b7379329b741..c667e6ddfe2f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -430,9 +430,7 @@ unmask:
* be taken as the remainder of this one. We need to kill the
* socket so the server throws away the partial SMB
*/
- spin_lock(&GlobalMid_Lock);
- server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
+ cifs_signal_cifsd_for_reconnect(server, false);
trace_smb3_partial_send_reconnect(server->CurrentMid,
server->conn_id, server->hostname);
}
@@ -466,13 +464,12 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
return -EIO;
}
- tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS);
+ tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS);
if (!tr_hdr)
return -ENOMEM;
memset(&cur_rqst[0], 0, sizeof(cur_rqst));
memset(&iov, 0, sizeof(iov));
- memset(tr_hdr, 0, sizeof(*tr_hdr));
iov.iov_base = tr_hdr;
iov.iov_len = sizeof(*tr_hdr);
@@ -544,7 +541,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_nblk_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits, -1, in_flight);
cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
__func__, 1, scredits);
@@ -578,10 +575,14 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
return -ERESTARTSYS;
spin_lock(&server->req_lock);
} else {
+ spin_unlock(&server->req_lock);
+
+ spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&server->req_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
return -ENOENT;
}
+ spin_unlock(&cifs_tcp_ses_lock);
/*
* For normal commands, reserve the last MAX_COMPOUND
@@ -596,6 +597,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
* for servers that are slow to hand out credits on
* new sessions.
*/
+ spin_lock(&server->req_lock);
if (!optype && num_credits == 1 &&
server->in_flight > 2 * MAX_COMPOUND &&
*credits <= MAX_COMPOUND) {
@@ -645,7 +647,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_waitff_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
-(num_credits), in_flight);
cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
@@ -723,28 +725,25 @@ cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
struct mid_q_entry **ppmidQ)
{
- if (ses->server->tcpStatus == CifsExiting) {
- return -ENOENT;
- }
-
- if (ses->server->tcpStatus == CifsNeedReconnect) {
- cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
- return -EAGAIN;
- }
-
+ spin_lock(&cifs_tcp_ses_lock);
if (ses->status == CifsNew) {
if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
- (in_buf->Command != SMB_COM_NEGOTIATE))
+ (in_buf->Command != SMB_COM_NEGOTIATE)) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -EAGAIN;
+ }
/* else ok - we are setting up session */
}
if (ses->status == CifsExiting) {
/* check if SMB session is bad because we are setting it up */
- if (in_buf->Command != SMB_COM_LOGOFF_ANDX)
+ if (in_buf->Command != SMB_COM_LOGOFF_ANDX) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -EAGAIN;
+ }
/* else ok - we are shutting down session */
}
+ spin_unlock(&cifs_tcp_ses_lock);
*ppmidQ = AllocMidQEntry(in_buf, ses->server);
if (*ppmidQ == NULL)
@@ -1044,16 +1043,14 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
if (!ses)
return NULL;
- if (!ses->binding) {
- /* round robin */
- if (ses->chan_count > 1) {
- index = (uint)atomic_inc_return(&ses->chan_seq);
- index %= ses->chan_count;
- }
- return ses->chans[index].server;
- } else {
- return cifs_ses_server(ses);
- }
+ /* round robin */
+ index = (uint)atomic_inc_return(&ses->chan_seq);
+
+ spin_lock(&ses->chan_lock);
+ index %= ses->chan_count;
+ spin_unlock(&ses->chan_lock);
+
+ return ses->chans[index].server;
}
int
@@ -1081,8 +1078,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- if (server->tcpStatus == CifsExiting)
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -ENOENT;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
/*
* Wait for all the requests to become available.
@@ -1185,12 +1186,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
/*
* Compounding is never used during session establish.
*/
+ spin_lock(&cifs_tcp_ses_lock);
if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
+ spin_unlock(&cifs_tcp_ses_lock);
+
mutex_lock(&server->srv_mutex);
- smb311_update_preauth_hash(ses, rqst[0].rq_iov,
- rqst[0].rq_nvec);
+ smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec);
mutex_unlock(&server->srv_mutex);
+
+ spin_lock(&cifs_tcp_ses_lock);
}
+ spin_unlock(&cifs_tcp_ses_lock);
for (i = 0; i < num_rqst; i++) {
rc = wait_for_response(server, midQ[i]);
@@ -1253,15 +1259,19 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
/*
* Compounding is never used during session establish.
*/
+ spin_lock(&cifs_tcp_ses_lock);
if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
struct kvec iov = {
.iov_base = resp_iov[0].iov_base,
.iov_len = resp_iov[0].iov_len
};
+ spin_unlock(&cifs_tcp_ses_lock);
mutex_lock(&server->srv_mutex);
- smb311_update_preauth_hash(ses, &iov, 1);
+ smb311_update_preauth_hash(ses, server, &iov, 1);
mutex_unlock(&server->srv_mutex);
+ spin_lock(&cifs_tcp_ses_lock);
}
+ spin_unlock(&cifs_tcp_ses_lock);
out:
/*
@@ -1350,8 +1360,12 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- if (server->tcpStatus == CifsExiting)
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -ENOENT;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
/* Ensure that we do not send more than 50 overlapping requests
to the same server. We may make this configurable later or
@@ -1491,8 +1505,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return -EIO;
}
- if (server->tcpStatus == CifsExiting)
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&cifs_tcp_ses_lock);
return -ENOENT;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
/* Ensure that we do not send more than 50 overlapping requests
to the same server. We may make this configurable later or
@@ -1550,10 +1568,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
(server->tcpStatus != CifsNew)));
/* Were we interrupted by a signal ? */
+ spin_lock(&cifs_tcp_ses_lock);
if ((rc == -ERESTARTSYS) &&
(midQ->mid_state == MID_REQUEST_SUBMITTED) &&
((server->tcpStatus == CifsGood) ||
(server->tcpStatus == CifsNew))) {
+ spin_unlock(&cifs_tcp_ses_lock);
if (in_buf->Command == SMB_COM_TRANSACTION2) {
/* POSIX lock. We send a NT_CANCEL SMB to cause the
@@ -1592,7 +1612,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
/* We got the response - restart system call. */
rstart = 1;
+ spin_lock(&cifs_tcp_ses_lock);
}
+ spin_unlock(&cifs_tcp_ses_lock);
rc = cifs_sync_mid_result(midQ, server);
if (rc != 0)
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 7d8b72d67c80..9d486fbbfbbd 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -175,11 +175,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
switch (handler->flags) {
case XATTR_CIFS_NTSD_FULL:
aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP |
CIFS_ACL_DACL |
CIFS_ACL_SACL);
break;
case XATTR_CIFS_NTSD:
aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP |
CIFS_ACL_DACL);
break;
case XATTR_CIFS_ACL:
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 06855f6c7902..62a3d2565c26 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -63,9 +63,10 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
struct inode *inode;
struct coda_inode_info *cii;
unsigned long hash = coda_f2i(fid);
+ umode_t inode_type = coda_inode_type(attr);
+retry:
inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid);
-
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -75,11 +76,15 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
inode->i_ino = hash;
/* inode is locked and unique, no need to grab cii->c_lock */
cii->c_mapcount = 0;
+ coda_fill_inode(inode, attr);
unlock_new_inode(inode);
+ } else if ((inode->i_mode & S_IFMT) != inode_type) {
+ /* Inode has changed type, mark bad and grab a new one */
+ remove_inode_hash(inode);
+ coda_flag_inode(inode, C_PURGE);
+ iput(inode);
+ goto retry;
}
-
- /* always replace the attributes, type might have changed */
- coda_fill_inode(inode, attr);
return inode;
}
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2e1a5a192074..903ca8fa4b9b 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -87,28 +87,27 @@ static struct coda_timespec timespec64_to_coda(struct timespec64 ts64)
}
/* utility functions below */
+umode_t coda_inode_type(struct coda_vattr *attr)
+{
+ switch (attr->va_type) {
+ case C_VREG:
+ return S_IFREG;
+ case C_VDIR:
+ return S_IFDIR;
+ case C_VLNK:
+ return S_IFLNK;
+ case C_VNON:
+ default:
+ return 0;
+ }
+}
+
void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
{
- int inode_type;
- /* inode's i_flags, i_ino are set by iget
- XXX: is this all we need ??
- */
- switch (attr->va_type) {
- case C_VNON:
- inode_type = 0;
- break;
- case C_VREG:
- inode_type = S_IFREG;
- break;
- case C_VDIR:
- inode_type = S_IFDIR;
- break;
- case C_VLNK:
- inode_type = S_IFLNK;
- break;
- default:
- inode_type = 0;
- }
+ /* inode's i_flags, i_ino are set by iget
+ * XXX: is this all we need ??
+ */
+ umode_t inode_type = coda_inode_type(attr);
inode->i_mode |= inode_type;
if (attr->va_mode != (u_short) -1)
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index e7b27754ce78..9be281bbcc06 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -53,10 +53,11 @@ int coda_getattr(struct user_namespace *, const struct path *, struct kstat *,
u32, unsigned int);
int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *);
-/* this file: heloers */
+/* this file: helpers */
char *coda_f2s(struct CodaFid *f);
int coda_iscontrol(const char *name, size_t length);
+umode_t coda_inode_type(struct coda_vattr *attr);
void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
unsigned short coda_flags_to_cflags(unsigned short);
@@ -83,6 +84,9 @@ static __inline__ void coda_flag_inode(struct inode *inode, int flag)
{
struct coda_inode_info *cii = ITOC(inode);
+ if (!inode)
+ return;
+
spin_lock(&cii->c_lock);
cii->c_flags |= flag;
spin_unlock(&cii->c_lock);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index d69989c1bac3..328d7a684b63 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -317,13 +317,10 @@ static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
coda_dir_drop_nlink(old_dir);
coda_dir_inc_nlink(new_dir);
}
- coda_dir_update_mtime(old_dir);
- coda_dir_update_mtime(new_dir);
coda_flag_inode(d_inode(new_dentry), C_VATTR);
- } else {
- coda_flag_inode(old_dir, C_VATTR);
- coda_flag_inode(new_dir, C_VATTR);
}
+ coda_dir_update_mtime(old_dir);
+ coda_dir_update_mtime(new_dir);
}
return error;
}
@@ -499,15 +496,20 @@ out:
*/
static int coda_dentry_delete(const struct dentry * dentry)
{
- int flags;
+ struct inode *inode;
+ struct coda_inode_info *cii;
if (d_really_is_negative(dentry))
return 0;
- flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE;
- if (is_bad_inode(d_inode(dentry)) || flags) {
+ inode = d_inode(dentry);
+ if (!inode || is_bad_inode(inode))
return 1;
- }
+
+ cii = ITOC(inode);
+ if (cii->c_flags & C_PURGE)
+ return 1;
+
return 0;
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ef5ca22bfb3e..3f3c81e6b1ab 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -8,11 +8,13 @@
* to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>.
*/
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/time.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/pagemap.h>
#include <linux/stat.h>
#include <linux/cred.h>
#include <linux/errno.h>
@@ -28,7 +30,7 @@
#include "coda_int.h"
struct coda_vm_ops {
- atomic_t refcnt;
+ refcount_t refcnt;
struct file *coda_file;
const struct vm_operations_struct *host_vm_ops;
struct vm_operations_struct vm_ops;
@@ -98,7 +100,7 @@ coda_vm_open(struct vm_area_struct *vma)
struct coda_vm_ops *cvm_ops =
container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
- atomic_inc(&cvm_ops->refcnt);
+ refcount_inc(&cvm_ops->refcnt);
if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->open)
cvm_ops->host_vm_ops->open(vma);
@@ -113,7 +115,7 @@ coda_vm_close(struct vm_area_struct *vma)
if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->close)
cvm_ops->host_vm_ops->close(vma);
- if (atomic_dec_and_test(&cvm_ops->refcnt)) {
+ if (refcount_dec_and_test(&cvm_ops->refcnt)) {
vma->vm_ops = cvm_ops->host_vm_ops;
fput(cvm_ops->coda_file);
kfree(cvm_ops);
@@ -189,7 +191,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
cvm_ops->vm_ops.open = coda_vm_open;
cvm_ops->vm_ops.close = coda_vm_close;
cvm_ops->coda_file = coda_file;
- atomic_set(&cvm_ops->refcnt, 1);
+ refcount_set(&cvm_ops->refcnt, 1);
vma->vm_ops = &cvm_ops->vm_ops;
}
@@ -238,11 +240,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
struct coda_file_info *cfi;
struct coda_inode_info *cii;
struct inode *host_inode;
- int err;
cfi = coda_ftoc(coda_file);
- err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
+ venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
coda_flags, coda_file->f_cred->fsuid);
host_inode = file_inode(cfi->cfi_container);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d9f1bd7153df..2185328b65c7 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -43,7 +43,7 @@ static struct kmem_cache * coda_inode_cachep;
static struct inode *coda_alloc_inode(struct super_block *sb)
{
struct coda_inode_info *ei;
- ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, coda_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
memset(&ei->c_fid, 0, sizeof(struct CodaFid));
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 240669f51eac..b39580ad4ce5 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -122,14 +122,10 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
hdr.opcode, hdr.unique);
nbytes = size;
}
- dcbuf = kvmalloc(nbytes, GFP_KERNEL);
- if (!dcbuf) {
- retval = -ENOMEM;
- goto out;
- }
- if (copy_from_user(dcbuf, buf, nbytes)) {
- kvfree(dcbuf);
- retval = -EFAULT;
+
+ dcbuf = vmemdup_user(buf, nbytes);
+ if (IS_ERR(dcbuf)) {
+ retval = PTR_ERR(dcbuf);
goto out;
}
@@ -388,7 +384,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam");
MODULE_DESCRIPTION("Coda Distributed File System VFS interface");
MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR);
MODULE_LICENSE("GPL");
-MODULE_VERSION("7.0");
+MODULE_VERSION("7.2");
static int __init init_coda(void)
{
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index eb3b1898da46..59f6cfd06f96 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -744,7 +744,8 @@ static int coda_upcall(struct venus_comm *vcp,
list_add_tail(&req->uc_chain, &vcp->vc_pending);
wake_up_interruptible(&vcp->vc_waitq);
- if (req->uc_flags & CODA_REQ_ASYNC) {
+ /* We can return early on asynchronous requests */
+ if (outSize == NULL) {
mutex_unlock(&vcp->vc_mutex);
return 0;
}
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 95e72d271b95..8f0af4f62631 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -135,6 +135,8 @@
#define elf_format compat_elf_format
#define init_elf_binfmt init_compat_elf_binfmt
#define exit_elf_binfmt exit_compat_elf_binfmt
+#define binfmt_elf_test_cases compat_binfmt_elf_test_cases
+#define binfmt_elf_test_suite compat_binfmt_elf_test_suite
/*
* We share all the actual code with the native (64-bit) version.
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 1466b5d01cbb..d1f9d2632202 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -34,6 +34,14 @@
*/
DEFINE_SPINLOCK(configfs_dirent_lock);
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_mutex is held.
+ * But parent configfs_subsystem is NULL when config_item is root.
+ * Use this mutex when config_item is root.
+ */
+static DEFINE_MUTEX(configfs_subsystem_mutex);
+
static void configfs_d_iput(struct dentry * dentry,
struct inode * inode)
{
@@ -1780,8 +1788,8 @@ void configfs_unregister_group(struct config_group *group)
configfs_detach_group(&group->cg_item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
+ d_drop(dentry);
fsnotify_rmdir(d_inode(parent), dentry);
- d_delete(dentry);
inode_unlock(d_inode(parent));
dput(dentry);
@@ -1859,7 +1867,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
sd = root->d_fsdata;
+ mutex_lock(&configfs_subsystem_mutex);
link_group(to_config_group(sd->s_element), group);
+ mutex_unlock(&configfs_subsystem_mutex);
inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
@@ -1884,7 +1894,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
inode_unlock(d_inode(root));
if (err) {
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
put_fragment(frag);
@@ -1922,16 +1934,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
configfs_detach_group(&group->cg_item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
- fsnotify_rmdir(d_inode(root), dentry);
inode_unlock(d_inode(dentry));
- d_delete(dentry);
+ d_drop(dentry);
+ fsnotify_rmdir(d_inode(root), dentry);
inode_unlock(d_inode(root));
dput(dentry);
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 3224dee44d30..ebc43f960b64 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -31,7 +31,6 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
-#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
@@ -41,6 +40,8 @@
#include <linux/fs.h>
#include <linux/path.h>
#include <linux/timekeeping.h>
+#include <linux/sysctl.h>
+#include <linux/elf.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -52,9 +53,12 @@
#include <trace/events/sched.h>
-int core_uses_pid;
-unsigned int core_pipe_limit;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
+static bool dump_vma_snapshot(struct coredump_params *cprm);
+static void free_vma_snapshot(struct coredump_params *cprm);
+
+static int core_uses_pid;
+static unsigned int core_pipe_limit;
+static char core_pattern[CORENAME_MAX_SIZE] = "core";
static int core_name_size = CORENAME_MAX_SIZE;
struct core_name {
@@ -62,8 +66,6 @@ struct core_name {
int used, size;
};
-/* The maximal length of core_pattern is also specified in sysctl.c */
-
static int expand_corename(struct core_name *cn, int size)
{
char *corename = krealloc(cn->corename, size, GFP_KERNEL);
@@ -347,19 +349,19 @@ out:
return ispipe;
}
-static int zap_process(struct task_struct *start, int exit_code, int flags)
+static int zap_process(struct task_struct *start, int exit_code)
{
struct task_struct *t;
int nr = 0;
/* ignore all signals except SIGKILL, see prepare_signal() */
- start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
+ start->signal->flags = SIGNAL_GROUP_EXIT;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
- if (t != current && t->mm) {
+ if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
nr++;
@@ -369,99 +371,34 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
return nr;
}
-static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+static int zap_threads(struct task_struct *tsk,
struct core_state *core_state, int exit_code)
{
- struct task_struct *g, *p;
- unsigned long flags;
+ struct signal_struct *signal = tsk->signal;
int nr = -EAGAIN;
spin_lock_irq(&tsk->sighand->siglock);
- if (!signal_group_exit(tsk->signal)) {
- mm->core_state = core_state;
- tsk->signal->group_exit_task = tsk;
- nr = zap_process(tsk, exit_code, 0);
+ if (!(signal->flags & SIGNAL_GROUP_EXIT) && !signal->group_exec_task) {
+ signal->core_state = core_state;
+ nr = zap_process(tsk, exit_code);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
+ tsk->flags |= PF_DUMPCORE;
+ atomic_set(&core_state->nr_threads, nr);
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (unlikely(nr < 0))
- return nr;
-
- tsk->flags |= PF_DUMPCORE;
- if (atomic_read(&mm->mm_users) == nr + 1)
- goto done;
- /*
- * We should find and kill all tasks which use this mm, and we should
- * count them correctly into ->nr_threads. We don't take tasklist
- * lock, but this is safe wrt:
- *
- * fork:
- * None of sub-threads can fork after zap_process(leader). All
- * processes which were created before this point should be
- * visible to zap_threads() because copy_process() adds the new
- * process to the tail of init_task.tasks list, and lock/unlock
- * of ->siglock provides a memory barrier.
- *
- * do_exit:
- * The caller holds mm->mmap_lock. This means that the task which
- * uses this mm can't pass exit_mm(), so it can't exit or clear
- * its ->mm.
- *
- * de_thread:
- * It does list_replace_rcu(&leader->tasks, &current->tasks),
- * we must see either old or new leader, this does not matter.
- * However, it can change p->sighand, so lock_task_sighand(p)
- * must be used. Since p->mm != NULL and we hold ->mmap_lock
- * it can't fail.
- *
- * Note also that "g" can be the old leader with ->mm == NULL
- * and already unhashed and thus removed from ->thread_group.
- * This is OK, __unhash_process()->list_del_rcu() does not
- * clear the ->next pointer, we will find the new leader via
- * next_thread().
- */
- rcu_read_lock();
- for_each_process(g) {
- if (g == tsk->group_leader)
- continue;
- if (g->flags & PF_KTHREAD)
- continue;
-
- for_each_thread(g, p) {
- if (unlikely(!p->mm))
- continue;
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code,
- SIGNAL_GROUP_EXIT);
- unlock_task_sighand(p, &flags);
- }
- break;
- }
- }
- rcu_read_unlock();
-done:
- atomic_set(&core_state->nr_threads, nr);
return nr;
}
static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
- struct mm_struct *mm = tsk->mm;
int core_waiters = -EBUSY;
init_completion(&core_state->startup);
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
- if (mmap_write_lock_killable(mm))
- return -EINTR;
-
- if (!mm->core_state)
- core_waiters = zap_threads(tsk, mm, core_state, exit_code);
- mmap_write_unlock(mm);
-
+ core_waiters = zap_threads(tsk, core_state, exit_code);
if (core_waiters > 0) {
struct core_thread *ptr;
@@ -483,7 +420,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
return core_waiters;
}
-static void coredump_finish(struct mm_struct *mm, bool core_dumped)
+static void coredump_finish(bool core_dumped)
{
struct core_thread *curr, *next;
struct task_struct *task;
@@ -491,24 +428,21 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
spin_lock_irq(&current->sighand->siglock);
if (core_dumped && !__fatal_signal_pending(current))
current->signal->group_exit_code |= 0x80;
- current->signal->group_exit_task = NULL;
- current->signal->flags = SIGNAL_GROUP_EXIT;
+ next = current->signal->core_state->dumper.next;
+ current->signal->core_state = NULL;
spin_unlock_irq(&current->sighand->siglock);
- next = mm->core_state->dumper.next;
while ((curr = next) != NULL) {
next = curr->next;
task = curr->task;
/*
- * see exit_mm(), curr->task must not see
+ * see coredump_task_exit(), curr->task must not see
* ->task == NULL before we read ->next.
*/
smp_mb();
curr->task = NULL;
wake_up_process(task);
}
-
- mm->core_state = NULL;
}
static bool dump_interrupted(void)
@@ -600,6 +534,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* by any locks.
*/
.mm_flags = mm->flags,
+ .vma_meta = NULL,
};
audit_core_dumps(siginfo->si_signo);
@@ -814,6 +749,9 @@ void do_coredump(const kernel_siginfo_t *siginfo)
pr_info("Core dump to |%s disabled\n", cn.corename);
goto close_fail;
}
+ if (!dump_vma_snapshot(&cprm))
+ goto close_fail;
+
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
/*
@@ -827,6 +765,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
dump_emit(&cprm, "", 1);
}
file_end_write(cprm.file);
+ free_vma_snapshot(&cprm);
}
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
@@ -839,7 +778,7 @@ fail_dropcount:
fail_unlock:
kfree(argv);
kfree(cn.corename);
- coredump_finish(mm, core_dumped);
+ coredump_finish(core_dumped);
revert_creds(old_cred);
fail_creds:
put_cred(cred);
@@ -961,6 +900,63 @@ int dump_align(struct coredump_params *cprm, int align)
}
EXPORT_SYMBOL(dump_align);
+#ifdef CONFIG_SYSCTL
+
+void validate_coredump_safety(void)
+{
+ if (suid_dumpable == SUID_DUMP_ROOT &&
+ core_pattern[0] != '/' && core_pattern[0] != '|') {
+ pr_warn(
+"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
+"Pipe handler or fully qualified core dump path required.\n"
+"Set kernel.core_pattern before fs.suid_dumpable.\n"
+ );
+ }
+}
+
+static int proc_dostring_coredump(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dostring(table, write, buffer, lenp, ppos);
+
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
+static struct ctl_table coredump_sysctls[] = {
+ {
+ .procname = "core_uses_pid",
+ .data = &core_uses_pid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "core_pattern",
+ .data = core_pattern,
+ .maxlen = CORENAME_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = proc_dostring_coredump,
+ },
+ {
+ .procname = "core_pipe_limit",
+ .data = &core_pipe_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static int __init init_fs_coredump_sysctls(void)
+{
+ register_sysctl_init("kernel", coredump_sysctls);
+ return 0;
+}
+fs_initcall(init_fs_coredump_sysctls);
+#endif /* CONFIG_SYSCTL */
+
/*
* The purpose of always_dump_vma() is to make sure that special kernel mappings
* that are useful for post-mortem analysis are included in every core dump.
@@ -992,6 +988,8 @@ static bool always_dump_vma(struct vm_area_struct *vma)
return false;
}
+#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1
+
/*
* Decide how much of @vma's contents should be included in a core dump.
*/
@@ -1051,9 +1049,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
* dump the first page to aid in determining what was mapped here.
*/
if (FILTER(ELF_HEADERS) &&
- vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) &&
- (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
- return PAGE_SIZE;
+ vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
+ if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
+ return PAGE_SIZE;
+
+ /*
+ * ELF libraries aren't always executable.
+ * We'll want to check whether the mapping starts with the ELF
+ * magic, but not now - we're holding the mmap lock,
+ * so copy_from_user() doesn't work here.
+ * Use a placeholder instead, and fix it up later in
+ * dump_vma_snapshot().
+ */
+ return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
+ }
#undef FILTER
@@ -1090,18 +1099,29 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
return gate_vma;
}
+static void free_vma_snapshot(struct coredump_params *cprm)
+{
+ if (cprm->vma_meta) {
+ int i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct file *file = cprm->vma_meta[i].file;
+ if (file)
+ fput(file);
+ }
+ kvfree(cprm->vma_meta);
+ cprm->vma_meta = NULL;
+ }
+}
+
/*
* Under the mmap_lock, take a snapshot of relevant information about the task's
* VMAs.
*/
-int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
- struct core_vma_metadata **vma_meta,
- size_t *vma_data_size_ptr)
+static bool dump_vma_snapshot(struct coredump_params *cprm)
{
struct vm_area_struct *vma, *gate_vma;
struct mm_struct *mm = current->mm;
int i;
- size_t vma_data_size = 0;
/*
* Once the stack expansion code is fixed to not change VMA bounds
@@ -1109,36 +1129,51 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
* mmap_lock in read mode.
*/
if (mmap_write_lock_killable(mm))
- return -EINTR;
+ return false;
+ cprm->vma_data_size = 0;
gate_vma = get_gate_vma(mm);
- *vma_count = mm->map_count + (gate_vma ? 1 : 0);
+ cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0);
- *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
- if (!*vma_meta) {
+ cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL);
+ if (!cprm->vma_meta) {
mmap_write_unlock(mm);
- return -ENOMEM;
+ return false;
}
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma), i++) {
- struct core_vma_metadata *m = (*vma_meta) + i;
+ struct core_vma_metadata *m = cprm->vma_meta + i;
m->start = vma->vm_start;
m->end = vma->vm_end;
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
+ m->pgoff = vma->vm_pgoff;
- vma_data_size += m->dump_size;
+ m->file = vma->vm_file;
+ if (m->file)
+ get_file(m->file);
}
mmap_write_unlock(mm);
- if (WARN_ON(i != *vma_count)) {
- kvfree(*vma_meta);
- return -EFAULT;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = cprm->vma_meta + i;
+
+ if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
+ char elfmag[SELFMAG];
+
+ if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) ||
+ memcmp(elfmag, ELFMAG, SELFMAG) != 0) {
+ m->dump_size = 0;
+ } else {
+ m->dump_size = PAGE_SIZE;
+ }
+ }
+
+ cprm->vma_data_size += m->dump_size;
}
- *vma_data_size_ptr = vma_data_size;
- return 0;
+ return true;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 2be65269a987..666aa380011e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -209,7 +209,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
return read_buffers[i] + blk_offset;
}
- devsize = mapping->host->i_size >> PAGE_SHIFT;
+ devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT;
/* Ok, read in BLKS_PER_BUF pages completely first. */
for (i = 0; i < BLKS_PER_BUF; i++) {
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 68a2de6b5a9b..2217fe5ece6f 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -1,23 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This contains encryption functions for per-file encryption.
+ * Utility functions for file contents encryption/decryption on
+ * block device-based filesystems.
*
* Copyright (C) 2015, Google, Inc.
* Copyright (C) 2015, Motorola Mobility
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- * Uday Savagaonkar, 2014
- * Encryption policy handling additions
- * Ildar Muslukhov, 2014
- * Add fscrypt_pullback_bio_page()
- * Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
*/
#include <linux/pagemap.h>
@@ -26,6 +13,21 @@
#include <linux/namei.h>
#include "fscrypt_private.h"
+/**
+ * fscrypt_decrypt_bio() - decrypt the contents of a bio
+ * @bio: the bio to decrypt
+ *
+ * Decrypt the contents of a "read" bio following successful completion of the
+ * underlying disk read. The bio must be reading a whole number of blocks of an
+ * encrypted file directly into the page cache. If the bio is reading the
+ * ciphertext into bounce pages instead of the page cache (for example, because
+ * the file is also compressed, so decompression is required after decryption),
+ * then this function isn't applicable. This function may sleep, so it must be
+ * called from a workqueue rather than from the bio's bi_end_io callback.
+ *
+ * This function sets PG_error on any pages that contain any blocks that failed
+ * to be decrypted. The filesystem must not mark such pages uptodate.
+ */
void fscrypt_decrypt_bio(struct bio *bio)
{
struct bio_vec *bv;
@@ -52,7 +54,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
int num_pages = 0;
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
- bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
+ bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE,
+ GFP_NOFS);
while (len) {
unsigned int blocks_this_page = min(len, blocks_per_page);
@@ -60,10 +63,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
if (num_pages == 0) {
fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
- bio_set_dev(bio, inode->i_sb->s_bdev);
bio->bi_iter.bi_sector =
pblk << (blockbits - SECTOR_SHIFT);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
}
ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
if (WARN_ON(ret != bytes_this_page)) {
@@ -79,7 +80,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
err = submit_bio_wait(bio);
if (err)
goto out;
- bio_reset(bio);
+ bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
num_pages = 0;
}
}
@@ -148,12 +149,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
return -EINVAL;
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
- bio = bio_alloc(GFP_NOFS, nr_pages);
+ bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
do {
- bio_set_dev(bio, inode->i_sb->s_bdev);
bio->bi_iter.bi_sector = pblk << (blockbits - 9);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
i = 0;
offset = 0;
@@ -180,7 +179,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
err = submit_bio_wait(bio);
if (err)
goto out;
- bio_reset(bio);
+ bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
} while (len != 0);
err = 0;
out:
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 4ef3f714046a..526a4c1bed99 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -69,6 +69,14 @@ void fscrypt_free_bounce_page(struct page *bounce_page)
}
EXPORT_SYMBOL(fscrypt_free_bounce_page);
+/*
+ * Generate the IV for the given logical block number within the given file.
+ * For filenames encryption, lblk_num == 0.
+ *
+ * Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks()
+ * needs to know about any IV generation methods where the low bits of IV don't
+ * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
+ */
void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci)
{
@@ -240,7 +248,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
* which must still be locked and not uptodate. Normally, blocksize ==
* PAGE_SIZE and the whole page is decrypted at once.
*
- * This is for use by the filesystem's ->readpages() method.
+ * This is for use by the filesystem's ->readahead() method.
*
* Return: 0 on success; -errno on failure
*/
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index eb538c28df94..a9be4bc74a94 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
if (fscrypt_has_encryption_key(dir)) {
if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
- iname->len,
- dir->i_sb->s_cop->max_namelen,
+ iname->len, NAME_MAX,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 3fa965eb3336..5b0a9e6478b5 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -20,6 +20,11 @@
#define FSCRYPT_FILE_NONCE_SIZE 16
+/*
+ * Minimum size of an fscrypt master key. Note: a longer key will be required
+ * if ciphers with a 256-bit security strength are used. This is just the
+ * absolute minimum, which applies when only 128-bit encryption is used.
+ */
#define FSCRYPT_MIN_KEY_SIZE 16
#define FSCRYPT_CONTEXT_V1 1
@@ -413,7 +418,11 @@ struct fscrypt_master_key_secret {
*/
struct fscrypt_hkdf hkdf;
- /* Size of the raw key in bytes. Set even if ->raw isn't set. */
+ /*
+ * Size of the raw key in bytes. This remains set even if ->raw was
+ * zeroized due to no longer being needed. I.e. we still remember the
+ * size of the key even if we don't need to remember the key itself.
+ */
u32 size;
/* For v1 policy keys: the raw key. Wiped for v2 policy keys. */
@@ -549,8 +558,9 @@ int __init fscrypt_init_keyring(void);
struct fscrypt_mode {
const char *friendly_name;
const char *cipher_str;
- int keysize;
- int ivsize;
+ int keysize; /* key size in bytes */
+ int security_strength; /* security strength in bytes */
+ int ivsize; /* IV size in bytes */
int logged_impl_name;
enum blk_crypto_mode_num blk_crypto_mode;
};
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index e0ec21055505..7607d18b35fc 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -16,9 +16,14 @@
/*
* HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses
- * SHA-512 because it is reasonably secure and efficient; and since it produces
- * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of
- * entropy from the master key and requires only one iteration of HKDF-Expand.
+ * SHA-512 because it is well-established, secure, and reasonably efficient.
+ *
+ * HKDF-SHA256 was also considered, as its 256-bit security strength would be
+ * sufficient here. A 512-bit security strength is "nice to have", though.
+ * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the
+ * common case of deriving an AES-256-XTS key (512 bits), that can result in
+ * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of
+ * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two.
*/
#define HKDF_HMAC_ALG "hmac(sha512)"
#define HKDF_HASHLEN SHA512_DIGEST_SIZE
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index c57bebfa48fe..93c2ca858092 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -17,6 +17,7 @@
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
+#include <linux/uio.h>
#include "fscrypt_private.h"
@@ -315,6 +316,10 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh);
*
* fscrypt_set_bio_crypt_ctx() must have already been called on the bio.
*
+ * This function isn't required in cases where crypto-mergeability is ensured in
+ * another way, such as I/O targeting only a single file (and thus a single key)
+ * combined with fscrypt_limit_io_blocks() to ensure DUN contiguity.
+ *
* Return: true iff the I/O is mergeable
*/
bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
@@ -363,3 +368,91 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
return fscrypt_mergeable_bio(bio, inode, next_lblk);
}
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
+
+/**
+ * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
+ * supported as far as encryption is concerned
+ * @iocb: the file and position the I/O is targeting
+ * @iter: the I/O data segment(s)
+ *
+ * Return: %true if there are no encryption constraints that prevent DIO from
+ * being supported; %false if DIO is unsupported. (Note that in the
+ * %true case, the filesystem might have other, non-encryption-related
+ * constraints that prevent DIO from actually being supported.)
+ */
+bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+{
+ const struct inode *inode = file_inode(iocb->ki_filp);
+ const unsigned int blocksize = i_blocksize(inode);
+
+ /* If the file is unencrypted, no veto from us. */
+ if (!fscrypt_needs_contents_encryption(inode))
+ return true;
+
+ /* We only support DIO with inline crypto, not fs-layer crypto. */
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return false;
+
+ /*
+ * Since the granularity of encryption is filesystem blocks, the file
+ * position and total I/O length must be aligned to the filesystem block
+ * size -- not just to the block device's logical block size as is
+ * traditionally the case for DIO on many filesystems.
+ *
+ * We require that the user-provided memory buffers be filesystem block
+ * aligned too. It is simpler to have a single alignment value required
+ * for all properties of the I/O, as is normally the case for DIO.
+ * Also, allowing less aligned buffers would imply that data units could
+ * cross bvecs, which would greatly complicate the I/O stack, which
+ * assumes that bios can be split at any bvec boundary.
+ */
+ if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
+
+/**
+ * fscrypt_limit_io_blocks() - limit I/O blocks to avoid discontiguous DUNs
+ * @inode: the file on which I/O is being done
+ * @lblk: the block at which the I/O is being started from
+ * @nr_blocks: the number of blocks we want to submit starting at @lblk
+ *
+ * Determine the limit to the number of blocks that can be submitted in a bio
+ * targeting @lblk without causing a data unit number (DUN) discontiguity.
+ *
+ * This is normally just @nr_blocks, as normally the DUNs just increment along
+ * with the logical blocks. (Or the file is not encrypted.)
+ *
+ * In rare cases, fscrypt can be using an IV generation method that allows the
+ * DUN to wrap around within logically contiguous blocks, and that wraparound
+ * will occur. If this happens, a value less than @nr_blocks will be returned
+ * so that the wraparound doesn't occur in the middle of a bio, which would
+ * cause encryption/decryption to produce wrong results.
+ *
+ * Return: the actual number of blocks that can be submitted
+ */
+u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
+{
+ const struct fscrypt_info *ci;
+ u32 dun;
+
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return nr_blocks;
+
+ if (nr_blocks <= 1)
+ return nr_blocks;
+
+ ci = inode->i_crypt_info;
+ if (!(fscrypt_policy_flags(&ci->ci_policy) &
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
+ return nr_blocks;
+
+ /* With IV_INO_LBLK_32, the DUN can wrap around from U32_MAX to 0. */
+
+ dun = ci->ci_hashed_ino + lblk;
+
+ return min_t(u64, nr_blocks, (u64)U32_MAX + 1 - dun);
+}
+EXPORT_SYMBOL_GPL(fscrypt_limit_io_blocks);
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index bca9c6658a7c..eede186b04ce 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
+ .security_strength = 32,
.ivsize = 16,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
},
@@ -26,12 +27,14 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-256-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 16,
},
[FSCRYPT_MODE_AES_128_CBC] = {
.friendly_name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
},
@@ -39,12 +42,14 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-128-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
},
[FSCRYPT_MODE_ADIANTUM] = {
.friendly_name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 32,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
},
@@ -117,8 +122,9 @@ err_free_tfm:
/*
* Prepare the crypto transform object or blk-crypto key in @prep_key, given the
- * raw key, encryption mode, and flag indicating which encryption implementation
- * (fs-layer or blk-crypto) will be used.
+ * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption
+ * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt),
+ * and IV generation method (@ci->ci_policy.flags).
*/
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key, const struct fscrypt_info *ci)
@@ -358,6 +364,45 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
}
/*
+ * Check whether the size of the given master key (@mk) is appropriate for the
+ * encryption settings which a particular file will use (@ci).
+ *
+ * If the file uses a v1 encryption policy, then the master key must be at least
+ * as long as the derived key, as this is a requirement of the v1 KDF.
+ *
+ * Otherwise, the KDF can accept any size key, so we enforce a slightly looser
+ * requirement: we require that the size of the master key be at least the
+ * maximum security strength of any algorithm whose key will be derived from it
+ * (but in practice we only need to consider @ci->ci_mode, since any other
+ * possible subkeys such as DIRHASH and INODE_HASH will never increase the
+ * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be
+ * derived from a 256-bit master key, which is cryptographically sufficient,
+ * rather than requiring a 512-bit master key which is unnecessarily long. (We
+ * still allow 512-bit master keys if the user chooses to use them, though.)
+ */
+static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
+ const struct fscrypt_info *ci)
+{
+ unsigned int min_keysize;
+
+ if (ci->ci_policy.version == FSCRYPT_POLICY_V1)
+ min_keysize = ci->ci_mode->keysize;
+ else
+ min_keysize = ci->ci_mode->security_strength;
+
+ if (mk->mk_secret.size < min_keysize) {
+ fscrypt_warn(NULL,
+ "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
+ master_key_spec_type(&mk->mk_spec),
+ master_key_spec_len(&mk->mk_spec),
+ (u8 *)&mk->mk_spec.u,
+ mk->mk_secret.size, min_keysize);
+ return false;
+ }
+ return true;
+}
+
+/*
* Find the master key, then set up the inode's actual encryption key.
*
* If the master key is found in the filesystem-level keyring, then the
@@ -422,18 +467,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
goto out_release_key;
}
- /*
- * Require that the master key be at least as long as the derived key.
- * Otherwise, the derived key cannot possibly contain as much entropy as
- * that required by the encryption mode it will be used for. For v1
- * policies it's also required for the KDF to work at all.
- */
- if (mk->mk_secret.size < ci->ci_mode->keysize) {
- fscrypt_warn(NULL,
- "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
- master_key_spec_type(&mk_spec),
- master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u,
- mk->mk_secret.size, ci->ci_mode->keysize);
+ if (!fscrypt_valid_master_key_size(mk, ci)) {
err = -ENOKEY;
goto out_release_key;
}
diff --git a/fs/d_path.c b/fs/d_path.c
index cd60c7535181..e4e0ebad1f15 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
/**
* prepend_name - prepend a pathname in front of current buffer pointer
- * @buffer: buffer pointer
- * @buflen: allocated length of the buffer
- * @name: name string and length qstr structure
+ * @p: prepend buffer which contains buffer pointer and allocated length
+ * @name: name string and length qstr structure
*
* With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
* make sure that either the old or the new name pointer and length are
@@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
* prepend_path - Prepend path string to a buffer
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry
- * @buffer: pointer to the end of the buffer
- * @buflen: pointer to buffer length
+ * @p: prepend buffer which contains buffer pointer and allocated length
*
* The function will first try to write out the pathname without taking any
* lock other than the RCU read lock to make sure that dentries won't go away.
diff --git a/fs/dax.c b/fs/dax.c
index 4e3e5a283a91..67a08a32fccb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -11,7 +11,6 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
@@ -390,7 +389,7 @@ static struct page *dax_busy_page(void *entry)
}
/*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * dax_lock_page - Lock the DAX entry corresponding to a page
* @page: The page whose entry we want to lock
*
* Context: Process context.
@@ -709,26 +708,26 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
return __dax_invalidate_entry(mapping, index, false);
}
-static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, struct page *to, unsigned long vaddr)
+static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
+ return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
+}
+
+static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
+{
+ pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
void *vto, *kaddr;
- pgoff_t pgoff;
long rc;
int id;
- rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
- if (rc)
- return rc;
-
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
+ rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
}
- vto = kmap_atomic(to);
- copy_user_page(vto, (void __force *)kaddr, vaddr, to);
+ vto = kmap_atomic(vmf->cow_page);
+ copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
kunmap_atomic(vto);
dax_read_unlock(id);
return 0;
@@ -1005,22 +1004,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
-static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
-{
- return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
-}
-
static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
pfn_t *pfnp)
{
- const sector_t sector = dax_iomap_sector(iomap, pos);
- pgoff_t pgoff;
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
int id, rc;
long length;
- rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
- if (rc)
- return rc;
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
NULL, pfnp);
@@ -1126,42 +1116,87 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
-s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
+static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
+ unsigned int offset, size_t size)
{
- sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
- pgoff_t pgoff;
- long rc, id;
void *kaddr;
- bool page_aligned = false;
- unsigned offset = offset_in_page(pos);
- unsigned size = min_t(u64, PAGE_SIZE - offset, length);
+ long ret;
- if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
- (size == PAGE_SIZE))
- page_aligned = true;
+ ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
+ if (ret > 0) {
+ memset(kaddr + offset, 0, size);
+ dax_flush(dax_dev, kaddr + offset, size);
+ }
+ return ret;
+}
- rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
- if (rc)
- return rc;
+static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
+{
+ const struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ u64 length = iomap_length(iter);
+ s64 written = 0;
+
+ /* already zeroed? we're done. */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ return length;
+
+ do {
+ unsigned offset = offset_in_page(pos);
+ unsigned size = min_t(u64, PAGE_SIZE - offset, length);
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
+ long rc;
+ int id;
+
+ id = dax_read_lock();
+ if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
+ else
+ rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
+ dax_read_unlock(id);
- id = dax_read_lock();
+ if (rc < 0)
+ return rc;
+ pos += size;
+ length -= size;
+ written += size;
+ if (did_zero)
+ *did_zero = true;
+ } while (length > 0);
- if (page_aligned)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
- else
- rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
- if (rc < 0) {
- dax_read_unlock(id);
- return rc;
- }
+ return written;
+}
- if (!page_aligned) {
- memset(kaddr + offset, 0, size);
- dax_flush(iomap->dax_dev, kaddr + offset, size);
- }
- dax_read_unlock(id);
- return size;
+int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+ const struct iomap_ops *ops)
+{
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .len = len,
+ .flags = IOMAP_DAX | IOMAP_ZERO,
+ };
+ int ret;
+
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = dax_zero_iter(&iter, did_zero);
+ return ret;
}
+EXPORT_SYMBOL_GPL(dax_zero_range);
+
+int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ const struct iomap_ops *ops)
+{
+ unsigned int blocksize = i_blocksize(inode);
+ unsigned int off = pos & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!off)
+ return 0;
+ return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(dax_truncate_page);
static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
@@ -1169,7 +1204,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
const struct iomap *iomap = &iomi->iomap;
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
- struct block_device *bdev = iomap->bdev;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
@@ -1203,9 +1237,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
while (pos < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
const size_t size = ALIGN(length + offset, PAGE_SIZE);
- const sector_t sector = dax_iomap_sector(iomap, pos);
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
ssize_t map_len;
- pgoff_t pgoff;
void *kaddr;
if (fatal_signal_pending(current)) {
@@ -1213,10 +1246,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
break;
}
- ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
- if (ret)
- break;
-
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
&kaddr, NULL);
if (map_len < 0) {
@@ -1230,11 +1259,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (map_len > end - pos)
map_len = end - pos;
- /*
- * The userspace address for the memory copy has already been
- * validated via access_ok() in either vfs_read() or
- * vfs_write(), depending on which operation we are doing.
- */
if (iov_iter_rw(iter) == WRITE)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
@@ -1274,6 +1298,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
.inode = iocb->ki_filp->f_mapping->host,
.pos = iocb->ki_pos,
.len = iov_iter_count(iter),
+ .flags = IOMAP_DAX,
};
loff_t done = 0;
int ret;
@@ -1332,19 +1357,16 @@ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
const struct iomap_iter *iter)
{
- sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
- unsigned long vaddr = vmf->address;
vm_fault_t ret;
int error = 0;
switch (iter->iomap.type) {
case IOMAP_HOLE:
case IOMAP_UNWRITTEN:
- clear_user_highpage(vmf->cow_page, vaddr);
+ clear_user_highpage(vmf->cow_page, vmf->address);
break;
case IOMAP_MAPPED:
- error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev,
- sector, vmf->cow_page, vaddr);
+ error = copy_cow_page_dax(vmf, iter);
break;
default:
WARN_ON_ONCE(1);
@@ -1430,7 +1452,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
.inode = mapping->host,
.pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
.len = PAGE_SIZE,
- .flags = IOMAP_FAULT,
+ .flags = IOMAP_DAX | IOMAP_FAULT,
};
vm_fault_t ret = 0;
void *entry;
@@ -1539,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct iomap_iter iter = {
.inode = mapping->host,
.len = PMD_SIZE,
- .flags = IOMAP_FAULT,
+ .flags = IOMAP_DAX | IOMAP_FAULT,
};
vm_fault_t ret = VM_FAULT_FALLBACK;
pgoff_t max_pgoff;
diff --git a/fs/dcache.c b/fs/dcache.c
index cf871a81f4fd..93f4f5ee07bf 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -115,10 +115,13 @@ static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT);
}
-
-/* Statistics gathering. */
-struct dentry_stat_t dentry_stat = {
- .age_limit = 45,
+struct dentry_stat_t {
+ long nr_dentry;
+ long nr_unused;
+ long age_limit; /* age in seconds */
+ long want_pages; /* pages requested by system */
+ long nr_negative; /* # of unused negative dentries */
+ long dummy; /* Reserved for future use */
};
static DEFINE_PER_CPU(long, nr_dentry);
@@ -126,6 +129,10 @@ static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+/* Statistics gathering. */
+static struct dentry_stat_t dentry_stat = {
+ .age_limit = 45,
+};
/*
* Here we resort to our own counters instead of using generic per-cpu counters
@@ -167,14 +174,32 @@ static long get_nr_dentry_negative(void)
return sum < 0 ? 0 : sum;
}
-int proc_nr_dentry(struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+static int proc_nr_dentry(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = get_nr_dentry();
dentry_stat.nr_unused = get_nr_dentry_unused();
dentry_stat.nr_negative = get_nr_dentry_negative();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
+
+static struct ctl_table fs_dcache_sysctls[] = {
+ {
+ .procname = "dentry-state",
+ .data = &dentry_stat,
+ .maxlen = 6*sizeof(long),
+ .mode = 0444,
+ .proc_handler = proc_nr_dentry,
+ },
+ { }
+};
+
+static int __init init_fs_dcache_sysctls(void)
+{
+ register_sysctl_init("fs", fs_dcache_sysctls);
+ return 0;
+}
+fs_initcall(init_fs_dcache_sysctls);
#endif
/*
@@ -1741,7 +1766,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
char *dname;
int err;
- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru,
+ GFP_KERNEL);
if (!dentry)
return NULL;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 7d162b0efbf0..950c63fa4d0b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -147,7 +147,7 @@ static int debugfs_locked_down(struct inode *inode,
struct file *filp,
const struct file_operations *real_fops)
{
- if ((inode->i_mode & 07777) == 0444 &&
+ if ((inode->i_mode & 07777 & ~0444) == 0 &&
!(filp->f_mode & FMODE_WRITE) &&
!real_fops->unlocked_ioctl &&
!real_fops->compat_ioctl &&
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 2f117c57160d..3dcf0b8b4e93 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -450,6 +450,11 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
+ *
+ * NOTE: it's expected that most callers should _ignore_ the errors returned
+ * by this function. Other debugfs functions handle the fact that the "dentry"
+ * passed to them could be an error and they don't crash in that case.
+ * Drivers should generally work fine even if debugfs fails to init anyway.
*/
struct dentry *debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
@@ -551,6 +556,11 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
+ *
+ * NOTE: it's expected that most callers should _ignore_ the errors returned
+ * by this function. Other debugfs functions handle the fact that the "dentry"
+ * passed to them could be an error and they don't crash in that case.
+ * Drivers should generally work fine even if debugfs fails to init anyway.
*/
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 42e5a766d33c..4f25015aa534 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -621,8 +621,8 @@ void devpts_pty_kill(struct dentry *dentry)
dentry->d_fsdata = NULL;
drop_nlink(dentry->d_inode);
- fsnotify_unlink(d_inode(dentry->d_parent), dentry);
d_drop(dentry);
+ fsnotify_unlink(d_inode(dentry->d_parent), dentry);
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b2e86e739d7a..aef06e607b40 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -119,7 +119,6 @@ struct dio {
int flags; /* doesn't change */
int op;
int op_flags;
- blk_qc_t bio_cookie;
struct gendisk *bio_disk;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
@@ -308,7 +307,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
if (ret > 0 && dio->op == REQ_OP_WRITE)
ret = generic_write_sync(dio->iocb, ret);
- dio->iocb->ki_complete(dio->iocb, ret, 0);
+ dio->iocb->ki_complete(dio->iocb, ret);
}
kmem_cache_free(dio_cache, dio);
@@ -397,18 +396,12 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
* bio_alloc() is guaranteed to return a bio when allowed to sleep and
* we request a valid number of vectors.
*/
- bio = bio_alloc(GFP_KERNEL, nr_vecs);
-
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL);
bio->bi_iter.bi_sector = first_sector;
- bio_set_op_attrs(bio, dio->op, dio->op_flags);
if (dio->is_async)
bio->bi_end_io = dio_bio_end_aio;
else
bio->bi_end_io = dio_bio_end_io;
-
- bio->bi_write_hint = dio->iocb->ki_hint;
-
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
@@ -438,11 +431,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->bio_disk = bio->bi_bdev->bd_disk;
- if (sdio->submit_io) {
+ if (sdio->submit_io)
sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
- dio->bio_cookie = BLK_QC_T_NONE;
- } else
- dio->bio_cookie = submit_bio(bio);
+ else
+ submit_bio(bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -481,9 +473,7 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
- blk_io_schedule();
+ blk_io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -1214,8 +1204,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
} else {
dio->op = REQ_OP_READ;
}
- if (iocb->ki_flags & IOCB_HIPRI)
- dio->op_flags |= REQ_HIPRI;
/*
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 283c7b94edda..bfac462dd3e8 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -9,6 +9,8 @@
*******************************************************************************
******************************************************************************/
+#include <trace/events/dlm.h>
+
#include "dlm_internal.h"
#include "lock.h"
#include "user.h"
@@ -254,10 +256,12 @@ void dlm_callback_work(struct work_struct *work)
continue;
} else if (callbacks[i].flags & DLM_CB_BAST) {
bastfn(lkb->lkb_astparam, callbacks[i].mode);
+ trace_dlm_bast(ls, lkb, callbacks[i].mode);
} else if (callbacks[i].flags & DLM_CB_CAST) {
lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
castfn(lkb->lkb_astparam);
+ trace_dlm_ast(ls, lkb, lkb->lkb_lksb);
}
}
@@ -295,7 +299,8 @@ void dlm_callback_suspend(struct dlm_ls *ls)
void dlm_callback_resume(struct dlm_ls *ls)
{
struct dlm_lkb *lkb, *safe;
- int count = 0;
+ int count = 0, sum = 0;
+ bool empty;
clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
@@ -311,14 +316,17 @@ more:
if (count == MAX_CB_QUEUE)
break;
}
+ empty = list_empty(&ls->ls_cb_delay);
mutex_unlock(&ls->ls_cb_mutex);
- if (count)
- log_rinfo(ls, "dlm_callback_resume %d", count);
- if (count == MAX_CB_QUEUE) {
+ sum += count;
+ if (!empty) {
count = 0;
cond_resched();
goto more;
}
+
+ if (sum)
+ log_rinfo(ls, "%s %d", __func__, sum);
}
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 47e9d57e4cae..8fb04ebbafb5 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -635,6 +635,35 @@ static int table_open2(struct inode *inode, struct file *file)
return 0;
}
+static ssize_t table_write2(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ int n, len, lkb_nodeid, lkb_status, error;
+ char name[DLM_RESNAME_MAXLEN + 1] = {};
+ struct dlm_ls *ls = seq->private;
+ unsigned int lkb_flags;
+ char buf[256] = {};
+ uint32_t lkb_id;
+
+ if (copy_from_user(buf, user_buf,
+ min_t(size_t, sizeof(buf) - 1, count)))
+ return -EFAULT;
+
+ n = sscanf(buf, "%x %" __stringify(DLM_RESNAME_MAXLEN) "s %x %d %d",
+ &lkb_id, name, &lkb_flags, &lkb_nodeid, &lkb_status);
+ if (n != 5)
+ return -EINVAL;
+
+ len = strnlen(name, DLM_RESNAME_MAXLEN);
+ error = dlm_debug_add_lkb(ls, lkb_id, name, len, lkb_flags,
+ lkb_nodeid, lkb_status);
+ if (error)
+ return error;
+
+ return count;
+}
+
static int table_open3(struct inode *inode, struct file *file)
{
struct seq_file *seq;
@@ -675,6 +704,7 @@ static const struct file_operations format2_fops = {
.owner = THIS_MODULE,
.open = table_open2,
.read = seq_read,
+ .write = table_write2,
.llseek = seq_lseek,
.release = seq_release
};
@@ -724,10 +754,35 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
return rv;
}
+static ssize_t waiters_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct dlm_ls *ls = file->private_data;
+ int mstype, to_nodeid;
+ char buf[128] = {};
+ uint32_t lkb_id;
+ int n, error;
+
+ if (copy_from_user(buf, user_buf,
+ min_t(size_t, sizeof(buf) - 1, count)))
+ return -EFAULT;
+
+ n = sscanf(buf, "%x %d %d", &lkb_id, &mstype, &to_nodeid);
+ if (n != 3)
+ return -EINVAL;
+
+ error = dlm_debug_add_lkb_to_waiters(ls, lkb_id, mstype, to_nodeid);
+ if (error)
+ return error;
+
+ return count;
+}
+
static const struct file_operations waiters_fops = {
.owner = THIS_MODULE,
.open = simple_open,
.read = waiters_read,
+ .write = waiters_write,
.llseek = default_llseek,
};
@@ -768,6 +823,42 @@ static int dlm_version_show(struct seq_file *file, void *offset)
}
DEFINE_SHOW_ATTRIBUTE(dlm_version);
+static ssize_t dlm_rawmsg_write(struct file *fp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ void *buf;
+ int ret;
+
+ if (count > PAGE_SIZE || count < sizeof(struct dlm_header))
+ return -EINVAL;
+
+ buf = kmalloc(PAGE_SIZE, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, user_buf, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = dlm_midcomms_rawmsg_send(fp->private_data, buf, count);
+ if (ret)
+ goto out;
+
+ kfree(buf);
+ return count;
+
+out:
+ kfree(buf);
+ return ret;
+}
+
+static const struct file_operations dlm_rawmsg_fops = {
+ .open = simple_open,
+ .write = dlm_rawmsg_write,
+ .llseek = no_llseek,
+};
+
void *dlm_create_debug_comms_file(int nodeid, void *data)
{
struct dentry *d_node;
@@ -782,6 +873,7 @@ void *dlm_create_debug_comms_file(int nodeid, void *data)
debugfs_create_file("send_queue_count", 0444, d_node, data,
&dlm_send_queue_cnt_fops);
debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops);
+ debugfs_create_file("rawmsg", 0200, d_node, data, &dlm_rawmsg_fops);
return d_node;
}
@@ -809,7 +901,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name);
ls->ls_debug_locks_dentry = debugfs_create_file(name,
- S_IFREG | S_IRUGO,
+ 0644,
dlm_root,
ls,
&format2_fops);
@@ -840,7 +932,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name);
ls->ls_debug_waiters_dentry = debugfs_create_file(name,
- S_IFREG | S_IRUGO,
+ 0644,
dlm_root,
ls,
&waiters_fops);
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 45ebbe602bbf..b6692f81ec83 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -84,8 +84,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
for (;;) {
int left;
- error = dlm_recovery_stopped(ls);
- if (error) {
+ if (dlm_recovery_stopped(ls)) {
error = -EINTR;
goto out_free;
}
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5f57538b5d45..74a9590a4dd5 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -41,12 +41,6 @@
#include <linux/dlm.h>
#include "config.h"
-/* Size of the temp buffer midcomms allocates on the stack.
- We try to make this large enough so most messages fit.
- FIXME: should sctp make this unnecessary? */
-
-#define DLM_INBUF_LEN 148
-
struct dlm_ls;
struct dlm_lkb;
struct dlm_rsb;
@@ -554,8 +548,9 @@ struct dlm_ls {
uint32_t ls_generation;
uint32_t ls_exflags;
int ls_lvblen;
- int ls_count; /* refcount of processes in
+ atomic_t ls_count; /* refcount of processes in
the dlm using this ls */
+ wait_queue_head_t ls_count_wait;
int ls_create_count; /* create/release refcount */
unsigned long ls_flags; /* LSFL_ */
unsigned long ls_scan_time;
@@ -581,6 +576,7 @@ struct dlm_ls {
struct list_head ls_new_rsb; /* new rsb structs */
spinlock_t ls_remove_spin;
+ wait_queue_head_t ls_remove_wait;
char ls_remove_name[DLM_RESNAME_MAXLEN+1];
char *ls_remove_names[DLM_REMOVE_NAMES_MAX];
int ls_remove_len;
@@ -632,6 +628,8 @@ struct dlm_ls {
struct rw_semaphore ls_in_recovery; /* block local requests */
struct rw_semaphore ls_recv_active; /* block dlm_recv */
struct list_head ls_requestqueue;/* queue remote requests */
+ atomic_t ls_requestqueue_cnt;
+ wait_queue_head_t ls_requestqueue_wait;
struct mutex ls_requestqueue_mutex;
struct dlm_rcom *ls_recover_buf;
int ls_recover_nodeid; /* for debugging */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c502c065d007..bdb51d209ba2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -53,6 +53,8 @@
R: do_xxxx()
L: receive_xxxx_reply() <- R: send_xxxx_reply()
*/
+#include <trace/events/dlm.h>
+
#include <linux/types.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
@@ -1178,7 +1180,8 @@ static void detach_lkb(struct dlm_lkb *lkb)
}
}
-static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
+ int start, int end)
{
struct dlm_lkb *lkb;
int rv;
@@ -1199,7 +1202,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
idr_preload(GFP_NOFS);
spin_lock(&ls->ls_lkbidr_spin);
- rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT);
+ rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT);
if (rv >= 0)
lkb->lkb_id = rv;
spin_unlock(&ls->ls_lkbidr_spin);
@@ -1215,6 +1218,11 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
return 0;
}
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+ return _create_lkb(ls, lkb_ret, 1, 0);
+}
+
static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
{
struct dlm_lkb *lkb;
@@ -1618,21 +1626,24 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
}
/* If there's an rsb for the same resource being removed, ensure
- that the remove message is sent before the new lookup message.
- It should be rare to need a delay here, but if not, then it may
- be worthwhile to add a proper wait mechanism rather than a delay. */
+ * that the remove message is sent before the new lookup message.
+ */
+
+#define DLM_WAIT_PENDING_COND(ls, r) \
+ (ls->ls_remove_len && \
+ !rsb_cmp(r, ls->ls_remove_name, \
+ ls->ls_remove_len))
static void wait_pending_remove(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
restart:
spin_lock(&ls->ls_remove_spin);
- if (ls->ls_remove_len &&
- !rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) {
+ if (DLM_WAIT_PENDING_COND(ls, r)) {
log_debug(ls, "delay lookup for remove dir %d %s",
- r->res_dir_nodeid, r->res_name);
+ r->res_dir_nodeid, r->res_name);
spin_unlock(&ls->ls_remove_spin);
- msleep(1);
+ wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r));
goto restart;
}
spin_unlock(&ls->ls_remove_spin);
@@ -1784,6 +1795,7 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
spin_unlock(&ls->ls_rsbtbl[b].lock);
+ wake_up(&ls->ls_remove_wait);
send_remove(r);
@@ -3437,6 +3449,8 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error)
goto out;
+ trace_dlm_lock_start(ls, lkb, mode, flags);
+
error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
astarg, bast, &args);
if (error)
@@ -3450,6 +3464,8 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error == -EINPROGRESS)
error = 0;
out_put:
+ trace_dlm_lock_end(ls, lkb, mode, flags, error);
+
if (convert || error)
__put_lkb(ls, lkb);
if (error == -EAGAIN || error == -EDEADLK)
@@ -3481,6 +3497,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
if (error)
goto out;
+ trace_dlm_unlock_start(ls, lkb, flags);
+
error = set_unlock_args(flags, astarg, &args);
if (error)
goto out_put;
@@ -3495,6 +3513,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
error = 0;
out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
+
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -3973,6 +3993,14 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
int from = ms->m_header.h_nodeid;
int error = 0;
+ /* currently mixing of user/kernel locks are not supported */
+ if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) {
+ log_error(lkb->lkb_resource->res_ls,
+ "got user dlm message for a kernel lock");
+ error = -EINVAL;
+ goto out;
+ }
+
switch (ms->m_type) {
case DLM_MSG_CONVERT:
case DLM_MSG_UNLOCK:
@@ -4001,6 +4029,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
error = -EINVAL;
}
+out:
if (error)
log_error(lkb->lkb_resource->res_ls,
"ignore invalid message %d from %d %x %x %x %d",
@@ -4050,6 +4079,7 @@ static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
spin_unlock(&ls->ls_rsbtbl[b].lock);
+ wake_up(&ls->ls_remove_wait);
rv = _create_message(ls, sizeof(struct dlm_message) + len,
dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
@@ -6301,3 +6331,64 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
return error;
}
+/* debug functionality */
+int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
+ int lkb_nodeid, unsigned int lkb_flags, int lkb_status)
+{
+ struct dlm_lksb *lksb;
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ /* we currently can't set a valid user lock */
+ if (lkb_flags & DLM_IFL_USER)
+ return -EOPNOTSUPP;
+
+ lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
+ if (!lksb)
+ return -ENOMEM;
+
+ error = _create_lkb(ls, &lkb, lkb_id, lkb_id + 1);
+ if (error) {
+ kfree(lksb);
+ return error;
+ }
+
+ lkb->lkb_flags = lkb_flags;
+ lkb->lkb_nodeid = lkb_nodeid;
+ lkb->lkb_lksb = lksb;
+ /* user specific pointer, just don't have it NULL for kernel locks */
+ if (~lkb_flags & DLM_IFL_USER)
+ lkb->lkb_astparam = (void *)0xDEADBEEF;
+
+ error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
+ if (error) {
+ kfree(lksb);
+ __put_lkb(ls, lkb);
+ return error;
+ }
+
+ lock_rsb(r);
+ attach_lkb(r, lkb);
+ add_lkb(r, lkb, lkb_status);
+ unlock_rsb(r);
+ put_rsb(r);
+
+ return 0;
+}
+
+int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id,
+ int mstype, int to_nodeid)
+{
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, lkb_id, &lkb);
+ if (error)
+ return error;
+
+ error = add_to_waiters(lkb, mstype, to_nodeid);
+ dlm_put_lkb(lkb);
+ return error;
+}
+
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 456c6ec3ef6f..252a5898f908 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -58,6 +58,10 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
int nodeid, int pid);
int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid);
void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
+int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
+ int lkb_nodeid, unsigned int lkb_flags, int lkb_status);
+int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id,
+ int mstype, int to_nodeid);
static inline int is_master(struct dlm_rsb *r)
{
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 10eddfa6c3d7..0d3833a124a3 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -216,8 +216,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
return ls->ls_uevent_result;
}
-static int dlm_uevent(struct kset *kset, struct kobject *kobj,
- struct kobj_uevent_env *env)
+static int dlm_uevent(struct kobject *kobj, struct kobj_uevent_env *env)
{
struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
@@ -314,7 +313,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
list_for_each_entry(ls, &lslist, ls_list) {
if (ls->ls_global_id == id) {
- ls->ls_count++;
+ atomic_inc(&ls->ls_count);
goto out;
}
}
@@ -331,7 +330,7 @@ struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
if (ls->ls_local_handle == lockspace) {
- ls->ls_count++;
+ atomic_inc(&ls->ls_count);
goto out;
}
}
@@ -348,7 +347,7 @@ struct dlm_ls *dlm_find_lockspace_device(int minor)
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
if (ls->ls_device.minor == minor) {
- ls->ls_count++;
+ atomic_inc(&ls->ls_count);
goto out;
}
}
@@ -360,24 +359,24 @@ struct dlm_ls *dlm_find_lockspace_device(int minor)
void dlm_put_lockspace(struct dlm_ls *ls)
{
- spin_lock(&lslist_lock);
- ls->ls_count--;
- spin_unlock(&lslist_lock);
+ if (atomic_dec_and_test(&ls->ls_count))
+ wake_up(&ls->ls_count_wait);
}
static void remove_lockspace(struct dlm_ls *ls)
{
- for (;;) {
- spin_lock(&lslist_lock);
- if (ls->ls_count == 0) {
- WARN_ON(ls->ls_create_count != 0);
- list_del(&ls->ls_list);
- spin_unlock(&lslist_lock);
- return;
- }
+retry:
+ wait_event(ls->ls_count_wait, atomic_read(&ls->ls_count) == 0);
+
+ spin_lock(&lslist_lock);
+ if (atomic_read(&ls->ls_count) != 0) {
spin_unlock(&lslist_lock);
- ssleep(1);
+ goto retry;
}
+
+ WARN_ON(ls->ls_create_count != 0);
+ list_del(&ls->ls_list);
+ spin_unlock(&lslist_lock);
}
static int threads_start(void)
@@ -481,7 +480,8 @@ static int new_lockspace(const char *name, const char *cluster,
memcpy(ls->ls_name, name, namelen);
ls->ls_namelen = namelen;
ls->ls_lvblen = lvblen;
- ls->ls_count = 0;
+ atomic_set(&ls->ls_count, 0);
+ init_waitqueue_head(&ls->ls_count_wait);
ls->ls_flags = 0;
ls->ls_scan_time = jiffies;
@@ -511,6 +511,7 @@ static int new_lockspace(const char *name, const char *cluster,
}
spin_lock_init(&ls->ls_remove_spin);
+ init_waitqueue_head(&ls->ls_remove_wait);
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
@@ -564,6 +565,8 @@ static int new_lockspace(const char *name, const char *cluster,
init_rwsem(&ls->ls_in_recovery);
init_rwsem(&ls->ls_recv_active);
INIT_LIST_HEAD(&ls->ls_requestqueue);
+ atomic_set(&ls->ls_requestqueue_cnt, 0);
+ init_waitqueue_head(&ls->ls_requestqueue_wait);
mutex_init(&ls->ls_requestqueue_mutex);
mutex_init(&ls->ls_clear_proc_locks);
@@ -868,7 +871,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
* until this returns.
*
* Force has 4 possible values:
- * 0 - don't destroy locksapce if it has any LKBs
+ * 0 - don't destroy lockspace if it has any LKBs
* 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
* 2 - destroy lockspace regardless of LKBs
* 3 - destroy lockspace as part of a forced shutdown
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 8f715c620e1f..e284d696c1fd 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -53,9 +53,12 @@
#include <net/sctp/sctp.h>
#include <net/ipv6.h>
+#include <trace/events/dlm.h>
+
#include "dlm_internal.h"
#include "lowcomms.h"
#include "midcomms.h"
+#include "memory.h"
#include "config.h"
#define NEEDED_RMEM (4*1024*1024)
@@ -84,7 +87,6 @@ struct connection {
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
atomic_t writequeue_cnt;
- struct mutex wq_alloc;
int retries;
#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
@@ -189,6 +191,24 @@ static const struct dlm_proto_ops *dlm_proto_ops;
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
+static void writequeue_entry_ctor(void *data)
+{
+ struct writequeue_entry *entry = data;
+
+ INIT_LIST_HEAD(&entry->msgs);
+}
+
+struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void)
+{
+ return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry),
+ 0, 0, writequeue_entry_ctor);
+}
+
+struct kmem_cache *dlm_lowcomms_msg_cache_create(void)
+{
+ return kmem_cache_create("dlm_msg", sizeof(struct dlm_msg), 0, 0, NULL);
+}
+
/* need to held writequeue_lock */
static struct writequeue_entry *con_next_wq(struct connection *con)
{
@@ -199,7 +219,10 @@ static struct writequeue_entry *con_next_wq(struct connection *con)
e = list_first_entry(&con->writequeue, struct writequeue_entry,
list);
- if (e->len == 0)
+ /* if len is zero nothing is to send, if there are users filling
+ * buffers we wait until the users are done so we can send more.
+ */
+ if (e->users || e->len == 0)
return NULL;
return e;
@@ -265,8 +288,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
return NULL;
}
- mutex_init(&con->wq_alloc);
-
spin_lock(&connections_lock);
/* Because multiple workqueues/threads calls this function it can
* race on multiple cpu's. Instead of locking hot path __find_con()
@@ -486,11 +507,9 @@ static void lowcomms_data_ready(struct sock *sk)
{
struct connection *con;
- read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
queue_work(recv_workqueue, &con->rwork);
- read_unlock_bh(&sk->sk_callback_lock);
}
static void lowcomms_listen_data_ready(struct sock *sk)
@@ -505,15 +524,14 @@ static void lowcomms_write_space(struct sock *sk)
{
struct connection *con;
- read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (!con)
- goto out;
+ return;
if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
log_print("successful connected to node %d", con->nodeid);
queue_work(send_workqueue, &con->swork);
- goto out;
+ return;
}
clear_bit(SOCK_NOSPACE, &con->sock->flags);
@@ -524,8 +542,6 @@ static void lowcomms_write_space(struct sock *sk)
}
queue_work(send_workqueue, &con->swork);
-out:
- read_unlock_bh(&sk->sk_callback_lock);
}
static inline void lowcomms_connect_sock(struct connection *con)
@@ -592,42 +608,41 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark)
static void lowcomms_error_report(struct sock *sk)
{
struct connection *con;
- struct sockaddr_storage saddr;
void (*orig_report)(struct sock *) = NULL;
+ struct inet_sock *inet;
- read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (con == NULL)
goto out;
orig_report = listen_sock.sk_error_report;
- if (kernel_getpeername(sk->sk_socket, (struct sockaddr *)&saddr) < 0) {
- printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
- "sending to node %d, port %d, "
- "sk_err=%d/%d\n", dlm_our_nodeid(),
- con->nodeid, dlm_config.ci_tcp_port,
- sk->sk_err, sk->sk_err_soft);
- } else if (saddr.ss_family == AF_INET) {
- struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr;
+ inet = inet_sk(sk);
+ switch (sk->sk_family) {
+ case AF_INET:
printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
- "sending to node %d at %pI4, port %d, "
+ "sending to node %d at %pI4, dport %d, "
"sk_err=%d/%d\n", dlm_our_nodeid(),
- con->nodeid, &sin4->sin_addr.s_addr,
- dlm_config.ci_tcp_port, sk->sk_err,
+ con->nodeid, &inet->inet_daddr,
+ ntohs(inet->inet_dport), sk->sk_err,
sk->sk_err_soft);
- } else {
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&saddr;
-
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
- "sending to node %d at %u.%u.%u.%u, "
- "port %d, sk_err=%d/%d\n", dlm_our_nodeid(),
- con->nodeid, sin6->sin6_addr.s6_addr32[0],
- sin6->sin6_addr.s6_addr32[1],
- sin6->sin6_addr.s6_addr32[2],
- sin6->sin6_addr.s6_addr32[3],
- dlm_config.ci_tcp_port, sk->sk_err,
+ "sending to node %d at %pI6c, "
+ "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(),
+ con->nodeid, &sk->sk_v6_daddr,
+ ntohs(inet->inet_dport), sk->sk_err,
sk->sk_err_soft);
+ break;
+#endif
+ default:
+ printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
+ "invalid socket family %d set, "
+ "sk_err=%d/%d\n", dlm_our_nodeid(),
+ sk->sk_family, sk->sk_err, sk->sk_err_soft);
+ goto out;
}
/* below sendcon only handling */
@@ -646,7 +661,6 @@ static void lowcomms_error_report(struct sock *sk)
queue_work(send_workqueue, &con->swork);
out:
- read_unlock_bh(&sk->sk_callback_lock);
if (orig_report)
orig_report(sk);
}
@@ -666,20 +680,20 @@ static void restore_callbacks(struct socket *sock)
{
struct sock *sk = sock->sk;
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
sk->sk_user_data = NULL;
sk->sk_data_ready = listen_sock.sk_data_ready;
sk->sk_state_change = listen_sock.sk_state_change;
sk->sk_write_space = listen_sock.sk_write_space;
sk->sk_error_report = listen_sock.sk_error_report;
- write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
}
static void add_listen_sock(struct socket *sock, struct listen_connection *con)
{
struct sock *sk = sock->sk;
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
save_listen_callbacks(sock);
con->sock = sock;
@@ -687,7 +701,7 @@ static void add_listen_sock(struct socket *sock, struct listen_connection *con)
sk->sk_allocation = GFP_NOFS;
/* Install a data_ready callback */
sk->sk_data_ready = lowcomms_listen_data_ready;
- write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
}
/* Make a socket active */
@@ -695,7 +709,7 @@ static void add_sock(struct socket *sock, struct connection *con)
{
struct sock *sk = sock->sk;
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
con->sock = sock;
sk->sk_user_data = con;
@@ -705,7 +719,7 @@ static void add_sock(struct socket *sock, struct connection *con)
sk->sk_state_change = lowcomms_state_change;
sk->sk_allocation = GFP_NOFS;
sk->sk_error_report = lowcomms_error_report;
- write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
}
/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -733,7 +747,7 @@ static void dlm_page_release(struct kref *kref)
ref);
__free_page(e->page);
- kfree(e);
+ dlm_free_writequeue(e);
}
static void dlm_msg_release(struct kref *kref)
@@ -741,7 +755,7 @@ static void dlm_msg_release(struct kref *kref)
struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref);
kref_put(&msg->entry->ref, dlm_page_release);
- kfree(msg);
+ dlm_free_msg(msg);
}
static void free_entry(struct writequeue_entry *e)
@@ -925,6 +939,7 @@ static int receive_from_sock(struct connection *con)
msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
msg.msg_flags);
+ trace_dlm_recv(con->nodeid, ret);
if (ret == -EAGAIN)
break;
else if (ret <= 0)
@@ -1013,10 +1028,28 @@ static int accept_from_sock(struct listen_connection *con)
/* Get the new node's NODEID */
make_sockaddr(&peeraddr, 0, &len);
if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) {
- unsigned char *b=(unsigned char *)&peeraddr;
- log_print("connect from non cluster node");
- print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
- b, sizeof(struct sockaddr_storage));
+ switch (peeraddr.ss_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr;
+
+ log_print("connect from non cluster IPv4 node %pI4",
+ &sin->sin_addr);
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr;
+
+ log_print("connect from non cluster IPv6 node %pI6c",
+ &sin6->sin6_addr);
+ break;
+ }
+#endif
+ default:
+ log_print("invalid family from non cluster node");
+ break;
+ }
+
sock_release(newsock);
return -1;
}
@@ -1177,33 +1210,33 @@ static void deinit_local(void)
kfree(dlm_local_addr[i]);
}
-static struct writequeue_entry *new_writequeue_entry(struct connection *con,
- gfp_t allocation)
+static struct writequeue_entry *new_writequeue_entry(struct connection *con)
{
struct writequeue_entry *entry;
- entry = kzalloc(sizeof(*entry), allocation);
+ entry = dlm_allocate_writequeue();
if (!entry)
return NULL;
- entry->page = alloc_page(allocation | __GFP_ZERO);
+ entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
if (!entry->page) {
- kfree(entry);
+ dlm_free_writequeue(entry);
return NULL;
}
+ entry->offset = 0;
+ entry->len = 0;
+ entry->end = 0;
+ entry->dirty = false;
entry->con = con;
entry->users = 1;
kref_init(&entry->ref);
- INIT_LIST_HEAD(&entry->msgs);
-
return entry;
}
static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
- gfp_t allocation, char **ppc,
- void (*cb)(struct dlm_mhandle *mh),
- struct dlm_mhandle *mh)
+ char **ppc, void (*cb)(void *data),
+ void *data)
{
struct writequeue_entry *e;
@@ -1215,74 +1248,54 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
*ppc = page_address(e->page) + e->end;
if (cb)
- cb(mh);
+ cb(data);
e->end += len;
e->users++;
- spin_unlock(&con->writequeue_lock);
-
- return e;
+ goto out;
}
}
- spin_unlock(&con->writequeue_lock);
- e = new_writequeue_entry(con, allocation);
+ e = new_writequeue_entry(con);
if (!e)
- return NULL;
+ goto out;
kref_get(&e->ref);
*ppc = page_address(e->page);
e->end += len;
atomic_inc(&con->writequeue_cnt);
-
- spin_lock(&con->writequeue_lock);
if (cb)
- cb(mh);
+ cb(data);
list_add_tail(&e->list, &con->writequeue);
- spin_unlock(&con->writequeue_lock);
+out:
+ spin_unlock(&con->writequeue_lock);
return e;
};
static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
gfp_t allocation, char **ppc,
- void (*cb)(struct dlm_mhandle *mh),
- struct dlm_mhandle *mh)
+ void (*cb)(void *data),
+ void *data)
{
struct writequeue_entry *e;
struct dlm_msg *msg;
- bool sleepable;
- msg = kzalloc(sizeof(*msg), allocation);
+ msg = dlm_allocate_msg(allocation);
if (!msg)
return NULL;
- /* this mutex is being used as a wait to avoid multiple "fast"
- * new writequeue page list entry allocs in new_wq_entry in
- * normal operation which is sleepable context. Without it
- * we could end in multiple writequeue entries with one
- * dlm message because multiple callers were waiting at
- * the writequeue_lock in new_wq_entry().
- */
- sleepable = gfpflags_normal_context(allocation);
- if (sleepable)
- mutex_lock(&con->wq_alloc);
-
kref_init(&msg->ref);
- e = new_wq_entry(con, len, allocation, ppc, cb, mh);
+ e = new_wq_entry(con, len, ppc, cb, data);
if (!e) {
- if (sleepable)
- mutex_unlock(&con->wq_alloc);
-
- kfree(msg);
+ dlm_free_msg(msg);
return NULL;
}
- if (sleepable)
- mutex_unlock(&con->wq_alloc);
-
+ msg->retransmit = false;
+ msg->orig_msg = NULL;
msg->ppc = *ppc;
msg->len = len;
msg->entry = e;
@@ -1291,8 +1304,8 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
}
struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
- char **ppc, void (*cb)(struct dlm_mhandle *mh),
- struct dlm_mhandle *mh)
+ char **ppc, void (*cb)(void *data),
+ void *data)
{
struct connection *con;
struct dlm_msg *msg;
@@ -1313,7 +1326,7 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
return NULL;
}
- msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh);
+ msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, data);
if (!msg) {
srcu_read_unlock(&connections_srcu, idx);
return NULL;
@@ -1403,7 +1416,6 @@ static void send_to_sock(struct connection *con)
if (!e)
break;
- e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
BUG_ON(len == 0 && e->users == 0);
@@ -1411,6 +1423,7 @@ static void send_to_sock(struct connection *con)
ret = kernel_sendpage(con->sock, e->page, offset, len,
msg_flags);
+ trace_dlm_send(con->nodeid, ret);
if (ret == -EAGAIN || ret == 0) {
if (ret == -EAGAIN &&
test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
@@ -1680,9 +1693,9 @@ static void _stop_conn(struct connection *con, bool and_other)
set_bit(CF_READ_PENDING, &con->flags);
set_bit(CF_WRITE_PENDING, &con->flags);
if (con->sock && con->sock->sk) {
- write_lock_bh(&con->sock->sk->sk_callback_lock);
+ lock_sock(con->sock->sk);
con->sock->sk->sk_user_data = NULL;
- write_unlock_bh(&con->sock->sk->sk_callback_lock);
+ release_sock(con->sock->sk);
}
if (con->othercon && and_other)
_stop_conn(con->othercon, false);
@@ -1775,7 +1788,7 @@ static int dlm_listen_for_all(void)
result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0) {
- log_print("Can't create comms socket, check SCTP is loaded");
+ log_print("Can't create comms socket: %d", result);
goto out;
}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 4ccae07cf005..29369feea991 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -38,8 +38,8 @@ void dlm_lowcomms_stop(void);
void dlm_lowcomms_exit(void);
int dlm_lowcomms_close(int nodeid);
struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
- char **ppc, void (*cb)(struct dlm_mhandle *mh),
- struct dlm_mhandle *mh);
+ char **ppc, void (*cb)(void *data),
+ void *data);
void dlm_lowcomms_commit_msg(struct dlm_msg *msg);
void dlm_lowcomms_put_msg(struct dlm_msg *msg);
int dlm_lowcomms_resend_msg(struct dlm_msg *msg);
@@ -47,6 +47,8 @@ int dlm_lowcomms_connect_node(int nodeid);
int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark);
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
void dlm_midcomms_receive_done(int nodeid);
+struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void);
+struct kmem_cache *dlm_lowcomms_msg_cache_create(void);
#endif /* __LOWCOMMS_DOT_H__ */
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index afc66a1346d3..1c5be4b70ac1 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -19,6 +19,9 @@
#include "config.h"
#include "lowcomms.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/dlm.h>
+
static int __init init_dlm(void)
{
int error;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 731d489aa323..61f906e705db 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -442,8 +442,7 @@ static int ping_members(struct dlm_ls *ls)
int error = 0;
list_for_each_entry(memb, &ls->ls_nodes, list) {
- error = dlm_recovery_stopped(ls);
- if (error) {
+ if (dlm_recovery_stopped(ls)) {
error = -EINTR;
break;
}
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 5918f4d39586..ce35c3c19aeb 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -10,32 +10,61 @@
******************************************************************************/
#include "dlm_internal.h"
+#include "midcomms.h"
+#include "lowcomms.h"
#include "config.h"
#include "memory.h"
+static struct kmem_cache *writequeue_cache;
+static struct kmem_cache *mhandle_cache;
+static struct kmem_cache *msg_cache;
static struct kmem_cache *lkb_cache;
static struct kmem_cache *rsb_cache;
int __init dlm_memory_init(void)
{
+ writequeue_cache = dlm_lowcomms_writequeue_cache_create();
+ if (!writequeue_cache)
+ goto out;
+
+ mhandle_cache = dlm_midcomms_cache_create();
+ if (!mhandle_cache)
+ goto mhandle;
+
lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
__alignof__(struct dlm_lkb), 0, NULL);
if (!lkb_cache)
- return -ENOMEM;
+ goto lkb;
+
+ msg_cache = dlm_lowcomms_msg_cache_create();
+ if (!msg_cache)
+ goto msg;
rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb),
__alignof__(struct dlm_rsb), 0, NULL);
- if (!rsb_cache) {
- kmem_cache_destroy(lkb_cache);
- return -ENOMEM;
- }
+ if (!rsb_cache)
+ goto rsb;
return 0;
+
+rsb:
+ kmem_cache_destroy(msg_cache);
+msg:
+ kmem_cache_destroy(lkb_cache);
+lkb:
+ kmem_cache_destroy(mhandle_cache);
+mhandle:
+ kmem_cache_destroy(writequeue_cache);
+out:
+ return -ENOMEM;
}
void dlm_memory_exit(void)
{
+ kmem_cache_destroy(writequeue_cache);
+ kmem_cache_destroy(mhandle_cache);
+ kmem_cache_destroy(msg_cache);
kmem_cache_destroy(lkb_cache);
kmem_cache_destroy(rsb_cache);
}
@@ -89,3 +118,32 @@ void dlm_free_lkb(struct dlm_lkb *lkb)
kmem_cache_free(lkb_cache, lkb);
}
+struct dlm_mhandle *dlm_allocate_mhandle(void)
+{
+ return kmem_cache_alloc(mhandle_cache, GFP_NOFS);
+}
+
+void dlm_free_mhandle(struct dlm_mhandle *mhandle)
+{
+ kmem_cache_free(mhandle_cache, mhandle);
+}
+
+struct writequeue_entry *dlm_allocate_writequeue(void)
+{
+ return kmem_cache_alloc(writequeue_cache, GFP_ATOMIC);
+}
+
+void dlm_free_writequeue(struct writequeue_entry *writequeue)
+{
+ kmem_cache_free(writequeue_cache, writequeue);
+}
+
+struct dlm_msg *dlm_allocate_msg(gfp_t allocation)
+{
+ return kmem_cache_alloc(msg_cache, allocation);
+}
+
+void dlm_free_msg(struct dlm_msg *msg)
+{
+ kmem_cache_free(msg_cache, msg);
+}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 4f218ea4b187..7bd3f1a391ca 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -20,6 +20,12 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
void dlm_free_lkb(struct dlm_lkb *l);
char *dlm_allocate_lvb(struct dlm_ls *ls);
void dlm_free_lvb(char *l);
+struct dlm_mhandle *dlm_allocate_mhandle(void);
+void dlm_free_mhandle(struct dlm_mhandle *mhandle);
+struct writequeue_entry *dlm_allocate_writequeue(void);
+void dlm_free_writequeue(struct writequeue_entry *writequeue);
+struct dlm_msg *dlm_allocate_msg(gfp_t allocation);
+void dlm_free_msg(struct dlm_msg *msg);
#endif /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 7ae39ec8d9b0..3635e42b0669 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -137,6 +137,7 @@
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
+#include "memory.h"
#include "lock.h"
#include "util.h"
#include "midcomms.h"
@@ -220,6 +221,12 @@ DEFINE_STATIC_SRCU(nodes_srcu);
*/
static DEFINE_MUTEX(close_lock);
+struct kmem_cache *dlm_midcomms_cache_create(void)
+{
+ return kmem_cache_create("dlm_mhandle", sizeof(struct dlm_mhandle),
+ 0, 0, NULL);
+}
+
static inline const char *dlm_state_str(int state)
{
switch (state) {
@@ -279,7 +286,7 @@ static void dlm_mhandle_release(struct rcu_head *rcu)
struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu);
dlm_lowcomms_put_msg(mh->msg);
- kfree(mh);
+ dlm_free_mhandle(mh);
}
static void dlm_mhandle_delete(struct midcomms_node *node,
@@ -909,11 +916,11 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
if (msglen > len)
break;
- switch (le32_to_cpu(hd->h_version)) {
- case DLM_VERSION_3_1:
+ switch (hd->h_version) {
+ case cpu_to_le32(DLM_VERSION_3_1):
dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid);
break;
- case DLM_VERSION_3_2:
+ case cpu_to_le32(DLM_VERSION_3_2):
dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid);
break;
default:
@@ -969,7 +976,7 @@ void dlm_midcomms_receive_done(int nodeid)
spin_unlock(&node->state_lock);
/* do nothing FIN has it's own ack send */
break;
- };
+ }
srcu_read_unlock(&nodes_srcu, idx);
}
@@ -1020,8 +1027,10 @@ static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len,
header_out(&opts->o_header);
}
-static void midcomms_new_msg_cb(struct dlm_mhandle *mh)
+static void midcomms_new_msg_cb(void *data)
{
+ struct dlm_mhandle *mh = data;
+
atomic_inc(&mh->node->send_queue_cnt);
spin_lock(&mh->node->send_queue_lock);
@@ -1071,10 +1080,12 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
/* this is a bug, however we going on and hope it will be resolved */
WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
- mh = kzalloc(sizeof(*mh), GFP_NOFS);
+ mh = dlm_allocate_mhandle();
if (!mh)
goto err;
+ mh->committed = false;
+ mh->ack_rcv = NULL;
mh->idx = idx;
mh->node = node;
@@ -1083,7 +1094,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc,
NULL, NULL);
if (!msg) {
- kfree(mh);
+ dlm_free_mhandle(mh);
goto err;
}
@@ -1092,13 +1103,13 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation,
ppc);
if (!msg) {
- kfree(mh);
+ dlm_free_mhandle(mh);
goto err;
}
break;
default:
- kfree(mh);
+ dlm_free_mhandle(mh);
WARN_ON(1);
goto err;
}
@@ -1134,7 +1145,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
dlm_lowcomms_commit_msg(mh->msg);
dlm_lowcomms_put_msg(mh->msg);
/* mh is not part of rcu list in this case */
- kfree(mh);
+ dlm_free_mhandle(mh);
break;
case DLM_VERSION_3_2:
dlm_midcomms_commit_msg_3_2(mh);
@@ -1231,7 +1242,7 @@ void dlm_midcomms_add_member(int nodeid)
}
node->users++;
- pr_debug("users inc count %d\n", node->users);
+ pr_debug("node %d users inc count %d\n", nodeid, node->users);
spin_unlock(&node->state_lock);
srcu_read_unlock(&nodes_srcu, idx);
@@ -1254,7 +1265,7 @@ void dlm_midcomms_remove_member(int nodeid)
spin_lock(&node->state_lock);
node->users--;
- pr_debug("users dec count %d\n", node->users);
+ pr_debug("node %d users dec count %d\n", nodeid, node->users);
/* hitting users count to zero means the
* other side is running dlm_midcomms_stop()
@@ -1425,3 +1436,51 @@ int dlm_midcomms_close(int nodeid)
return ret;
}
+
+/* debug functionality to send raw dlm msg from user space
+ *
+ * struct dlm_rawmsg_data is the context that dlm_midcomms_rawmsg_send()
+ * passes as the opaque data pointer to midcomms_new_rawmsg_cb().
+ */
+struct dlm_rawmsg_data {
+ struct midcomms_node *node; /* destination node; the callback may consume its seq_send */
+ void *buf; /* caller's raw message; the callback reads it as a struct dlm_header */
+};
+
+/*
+ * Callback handed to dlm_lowcomms_new_msg() by dlm_midcomms_rawmsg_send().
+ * @data is a struct dlm_rawmsg_data. For anything other than a
+ * DLM_VERSION_3_1 message, a DLM_OPTS header whose h_seq is still zero is
+ * given the node's next send sequence number.
+ *
+ * The buffer holds a message in wire format (h_version is compared against
+ * cpu_to_le32() constants), so the sequence number is stored little-endian
+ * as well rather than in CPU byte order, which would be wrong on
+ * big-endian hosts.
+ */
+static void midcomms_new_rawmsg_cb(void *data)
+{
+ struct dlm_rawmsg_data *rd = data;
+ struct dlm_header *h = rd->buf;
+
+ switch (h->h_version) {
+ case cpu_to_le32(DLM_VERSION_3_1):
+ /* 3.1 messages are left untouched */
+ break;
+ default:
+ switch (h->h_cmd) {
+ case DLM_OPTS:
+ /* only fill in a sequence number the sender left at zero */
+ if (!h->u.h_seq)
+ h->u.h_seq = cpu_to_le32(rd->node->seq_send++);
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+}
+/*
+ * Debug path: send the caller-provided raw DLM message @buf of @buflen
+ * bytes to @node. A lowcomms message is requested with GFP_NOFS, @buf is
+ * copied into the buffer returned through msgbuf, and the message is
+ * committed.
+ *
+ * Returns 0 on success, or -ENOMEM when dlm_lowcomms_new_msg() returns
+ * NULL.
+ *
+ * NOTE(review): other commit sites in this file follow
+ * dlm_lowcomms_commit_msg() with dlm_lowcomms_put_msg(); there is no put
+ * here - confirm whether a reference is leaked.
+ */
+int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf,
+ int buflen)
+{
+ struct dlm_rawmsg_data rd;
+ struct dlm_msg *msg;
+ char *msgbuf;
+
+ rd.node = node;
+ rd.buf = buf;
+
+ msg = dlm_lowcomms_new_msg(node->nodeid, buflen, GFP_NOFS,
+ &msgbuf, midcomms_new_rawmsg_cb, &rd); /* rd lives on this stack frame */
+ if (!msg)
+ return -ENOMEM;
+
+ /* presumably the callback has already run and may have set h_seq in buf - TODO confirm */
+ memcpy(msgbuf, buf, buflen);
+ dlm_lowcomms_commit_msg(msg);
+ return 0;
+}
+
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 579abc6929be..82bcd9661922 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -28,6 +28,9 @@ const char *dlm_midcomms_state(struct midcomms_node *node);
unsigned long dlm_midcomms_flags(struct midcomms_node *node);
int dlm_midcomms_send_queue_cnt(struct midcomms_node *node);
uint32_t dlm_midcomms_version(struct midcomms_node *node);
+int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf,
+ int buflen);
+struct kmem_cache *dlm_midcomms_cache_create(void);
#endif /* __MIDCOMMS_DOT_H__ */
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 6cba86470278..5821b777a1a7 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -601,7 +601,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
spin_lock(&ls->ls_recover_lock);
status = ls->ls_recover_status;
- stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
+ stop = dlm_recovery_stopped(ls);
seq = ls->ls_recover_seq;
spin_unlock(&ls->ls_recover_lock);
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 97d052cea5a9..a55dfce705dd 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -124,8 +124,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_recover_waiters_pre(ls);
- error = dlm_recovery_stopped(ls);
- if (error) {
+ if (dlm_recovery_stopped(ls)) {
error = -EINTR;
goto fail;
}
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index e89e0ff8bfa3..ccb5307c21e9 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -44,6 +44,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
e->nodeid = nodeid;
memcpy(&e->request, ms, ms->m_header.h_length);
+ atomic_inc(&ls->ls_requestqueue_cnt);
mutex_lock(&ls->ls_requestqueue_mutex);
list_add_tail(&e->list, &ls->ls_requestqueue);
mutex_unlock(&ls->ls_requestqueue_mutex);
@@ -89,6 +90,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
mutex_lock(&ls->ls_requestqueue_mutex);
list_del(&e->list);
+ if (atomic_dec_and_test(&ls->ls_requestqueue_cnt))
+ wake_up(&ls->ls_requestqueue_wait);
kfree(e);
if (dlm_locking_stopped(ls)) {
@@ -115,14 +118,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
void dlm_wait_requestqueue(struct dlm_ls *ls)
{
- for (;;) {
- mutex_lock(&ls->ls_requestqueue_mutex);
- if (list_empty(&ls->ls_requestqueue))
- break;
- mutex_unlock(&ls->ls_requestqueue_mutex);
- schedule();
- }
- mutex_unlock(&ls->ls_requestqueue_mutex);
+ wait_event(ls->ls_requestqueue_wait,
+ atomic_read(&ls->ls_requestqueue_cnt) == 0);
}
static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
@@ -130,7 +127,7 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
uint32_t type = ms->m_type;
/* the ls is being cleaned up and freed by release_lockspace */
- if (!ls->ls_count)
+ if (!atomic_read(&ls->ls_count))
return 1;
if (dlm_is_removed(ls, nodeid))
@@ -161,6 +158,8 @@ void dlm_purge_requestqueue(struct dlm_ls *ls)
if (purge_request(ls, ms, e->nodeid)) {
list_del(&e->list);
+ if (atomic_dec_and_test(&ls->ls_requestqueue_cnt))
+ wake_up(&ls->ls_requestqueue_wait);
kfree(e);
}
}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d66bbd2df191..2dd23a82e0de 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -537,7 +537,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out_free;
}
- if (mnt_user_ns(path.mnt) != &init_user_ns) {
+ if (is_idmapped_mnt(path.mnt)) {
rc = -EINVAL;
printk(KERN_ERR "Mounting on idmapped mounts currently disallowed\n");
goto out_free;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 7d85e64ea62f..9ad61b582f07 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -540,12 +540,13 @@ const struct address_space_operations ecryptfs_aops = {
* XXX: This is pretty broken for multiple reasons: ecryptfs does not
* actually use buffer_heads, and ecryptfs will crash without
* CONFIG_BLOCK. But it matches the behavior before the default for
- * address_space_operations without the ->set_page_dirty method was
+ * address_space_operations without the ->dirty_folio method was
* cleaned up, so this is the best we can do without maintainer
* feedback.
*/
#ifdef CONFIG_BLOCK
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
#endif
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 39116af0390f..0b1c878317ab 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -38,7 +38,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
struct ecryptfs_inode_info *inode_info;
struct inode *inode = NULL;
- inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL);
+ inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL);
if (unlikely(!inode_info))
goto out;
if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) {
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 62b155b9366b..b287f47c165b 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -69,7 +69,7 @@ static struct kmem_cache * efs_inode_cachep;
static struct inode *efs_alloc_inode(struct super_block *sb)
{
struct efs_inode_info *ei;
- ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, efs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 14b747026742..f57255ab88ed 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -6,16 +6,22 @@ config EROFS_FS
select FS_IOMAP
select LIBCRC32C
help
- EROFS (Enhanced Read-Only File System) is a lightweight
- read-only file system with modern designs (eg. page-sized
- blocks, inline xattrs/data, etc.) for scenarios which need
- high-performance read-only requirements, e.g. Android OS
- for mobile phones and LIVECDs.
+ EROFS (Enhanced Read-Only File System) is a lightweight read-only
+ file system with modern designs (e.g. no buffer heads, inline
+ xattrs/data, chunk-based deduplication, multiple devices, etc.) for
+ scenarios which need high-performance read-only solutions, e.g.
+ smartphones with Android OS, LiveCDs and high-density hosts with
+ numerous containers.
- It also provides fixed-sized output compression support,
- which improves storage density, keeps relatively higher
- compression ratios, which is more useful to achieve high
- performance for embedded devices with limited memory.
+ It also provides fixed-sized output compression support in order to
+ improve storage density as well as keep relatively higher compression
+ ratios and implements in-place decompression to reuse the file page
+ for compressed data temporarily with proper strategies, which is
+ quite useful to ensure guaranteed end-to-end runtime decompression
+ performance under extreme memory pressure without extra cost.
+
+ See the documentation at <file:Documentation/filesystems/erofs.rst>
+ for more details.
If unsure, say N.
@@ -76,3 +82,19 @@ config EROFS_FS_ZIP
Enable fixed-sized output compression for EROFS.
If you don't want to enable compression feature, say N.
+
+config EROFS_FS_ZIP_LZMA
+ bool "EROFS LZMA compressed data support"
+ depends on EROFS_FS_ZIP
+ select XZ_DEC
+ select XZ_DEC_MICROLZMA
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing LZMA compressed data, specifically called microLZMA. It
+ gives better compression ratios than the LZ4 algorithm, at the
+ expense of more CPU overhead.
+
+ LZMA support is an experimental feature for now and so most file
+ systems will be readable without selecting this option.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 1f9aced49070..8a3317e38e5a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 3701c72bacb2..19e6c56a9f47 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -8,16 +8,11 @@
#include "internal.h"
-enum {
- Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
- Z_EROFS_COMPRESSION_RUNTIME_MAX
-};
-
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
- unsigned short pageofs_out;
+ unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
/* indicate the algorithm will be used for decompression */
@@ -25,6 +20,12 @@ struct z_erofs_decompress_req {
bool inplace_io, partial_decoding;
};
+struct z_erofs_decompressor { /* one algorithm entry; z_erofs_decompress() looks entries up by rq->alg */
+ int (*decompress)(struct z_erofs_decompress_req *rq,
+ struct page **pagepool); /* decompresses one request; returns an int status */
+ char *name; /* short algorithm name, e.g. "shifted", "lz4", "lzma" */
+};
+
/* some special page->private (unsigned long, see below) */
#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
@@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page)
return true;
}
-static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
+static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
struct page *page)
{
if (!z_erofs_is_shortlived_page(page))
@@ -74,13 +75,24 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
put_page(page);
} else {
/* follow the pcluster rule above. */
- set_page_private(page, 0);
- list_add(&page->lru, pagepool);
+ erofs_pagepool_add(pagepool, page);
}
return true;
}
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+ struct page *page)
+{
+ /* true iff @page belongs to the mapping of sbi->managed_cache */
+ return page->mapping == MNGD_MAPPING(sbi);
+}
+
+int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
+ unsigned int padbufsize);
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
+ struct page **pagepool);
+/* prototypes for specific algorithms */
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 9db829715652..780db1e5f4b7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -9,37 +9,77 @@
#include <linux/dax.h>
#include <trace/events/erofs.h>
-struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
+void erofs_unmap_metabuf(struct erofs_buf *buf)
{
- struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
- struct page *page;
-
- page = read_cache_page_gfp(mapping, blkaddr,
- mapping_gfp_constraint(mapping, ~__GFP_FS));
- /* should already be PageUptodate */
- if (!IS_ERR(page))
- lock_page(page);
- return page;
+ if (buf->kmap_type == EROFS_KMAP)
+ kunmap(buf->page);
+ else if (buf->kmap_type == EROFS_KMAP_ATOMIC)
+ kunmap_atomic(buf->base);
+ buf->base = NULL;
+ buf->kmap_type = EROFS_NO_KMAP;
+}
+/*
+ * Release the page held by @buf, if any: undo its mapping through
+ * erofs_unmap_metabuf(), drop the page reference and clear buf->page.
+ * Does nothing when buf->page is NULL.
+ */
+void erofs_put_metabuf(struct erofs_buf *buf)
+{
+ if (!buf->page)
+ return;
+ erofs_unmap_metabuf(buf);
+ put_page(buf->page);
+ buf->page = NULL;
+}
+/*
+ * Make the page that holds block @blkaddr of @inode's mapping the current
+ * page of @buf and return a pointer to that block's data.
+ *
+ * Returns the error pointer from read_cache_page_gfp() on read failure,
+ * ERR_PTR(-EFAULT) if @buf is already mapped with a kmap type other than
+ * @type, NULL if @type is EROFS_NO_KMAP, otherwise buf->base plus the
+ * offset of the block within its page.
+ */
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
+{
+ struct address_space *const mapping = inode->i_mapping;
+ erofs_off_t offset = blknr_to_addr(blkaddr);
+ pgoff_t index = offset >> PAGE_SHIFT;
+ struct page *page = buf->page;
+
+ if (!page || page->index != index) { /* the page already held (if any) is not the one needed */
+ erofs_put_metabuf(buf); /* releases and unmaps the previous page, if there was one */
+ page = read_cache_page_gfp(mapping, index,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
+ if (IS_ERR(page))
+ return page; /* error pointer is passed straight to the caller */
+ /* should already be PageUptodate, no need to lock page */
+ buf->page = page;
+ }
+ if (buf->kmap_type == EROFS_NO_KMAP) { /* not mapped yet: map as requested */
+ if (type == EROFS_KMAP)
+ buf->base = kmap(page);
+ else if (type == EROFS_KMAP_ATOMIC)
+ buf->base = kmap_atomic(page);
+ buf->kmap_type = type;
+ } else if (buf->kmap_type != type) { /* a reused page keeps its mapping; the type must match */
+ DBG_BUGON(1);
+ return ERR_PTR(-EFAULT);
+ }
+ if (type == EROFS_NO_KMAP)
+ return NULL; /* caller only wanted buf->page */
+ return buf->base + (offset & ~PAGE_MASK);
+}
+
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
+{
+ return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
}
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
{
- int err = 0;
erofs_blk_t nblocks, lastblk;
u64 offset = map->m_la;
struct erofs_inode *vi = EROFS_I(inode);
bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
- trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
-
- nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+ nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
lastblk = nblocks - tailendpacking;
/* there is no hole in flatmode */
map->m_flags = EROFS_MAP_MAPPED;
-
if (offset < blknr_to_addr(lastblk)) {
map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
map->m_plen = blknr_to_addr(lastblk) - offset;
@@ -51,30 +91,23 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
vi->xattr_isize + erofs_blkoff(map->m_la);
map->m_plen = inode->i_size - offset;
- /* inline data should be located in one meta block */
- if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) {
+ /* inline data should be located in the same meta block */
+ if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) {
erofs_err(inode->i_sb,
"inline data cross block boundary @ nid %llu",
vi->nid);
DBG_BUGON(1);
- err = -EFSCORRUPTED;
- goto err_out;
+ return -EFSCORRUPTED;
}
-
map->m_flags |= EROFS_MAP_META;
} else {
erofs_err(inode->i_sb,
"internal error @ nid: %llu (size %llu), m_la 0x%llx",
vi->nid, inode->i_size, map->m_la);
DBG_BUGON(1);
- err = -EIO;
- goto err_out;
+ return -EIO;
}
-
- map->m_llen = map->m_plen;
-err_out:
- trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
- return err;
+ return 0;
}
static int erofs_map_blocks(struct inode *inode,
@@ -83,12 +116,15 @@ static int erofs_map_blocks(struct inode *inode,
struct super_block *sb = inode->i_sb;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_inode_chunk_index *idx;
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
u64 chunknr;
unsigned int unit;
erofs_off_t pos;
+ void *kaddr;
int err = 0;
+ trace_erofs_map_blocks_enter(inode, map, flags);
+ map->m_deviceid = 0;
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
@@ -96,8 +132,10 @@ static int erofs_map_blocks(struct inode *inode,
goto out;
}
- if (vi->datalayout != EROFS_INODE_CHUNK_BASED)
- return erofs_map_blocks_flatmode(inode, map, flags);
+ if (vi->datalayout != EROFS_INODE_CHUNK_BASED) {
+ err = erofs_map_blocks_flatmode(inode, map, flags);
+ goto out;
+ }
if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)
unit = sizeof(*idx); /* chunk index */
@@ -108,17 +146,18 @@ static int erofs_map_blocks(struct inode *inode,
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos));
- if (IS_ERR(page))
- return PTR_ERR(page);
-
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ err = PTR_ERR(kaddr);
+ goto out;
+ }
map->m_la = chunknr << vi->chunkbits;
map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
roundup(inode->i_size - map->m_la, EROFS_BLKSIZ));
/* handle block map */
if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
- __le32 *blkaddr = page_address(page) + erofs_blkoff(pos);
+ __le32 *blkaddr = kaddr + erofs_blkoff(pos);
if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
map->m_flags = 0;
@@ -129,37 +168,79 @@ static int erofs_map_blocks(struct inode *inode,
goto out_unlock;
}
/* parse chunk indexes */
- idx = page_address(page) + erofs_blkoff(pos);
+ idx = kaddr + erofs_blkoff(pos);
switch (le32_to_cpu(idx->blkaddr)) {
case EROFS_NULL_ADDR:
map->m_flags = 0;
break;
default:
- /* only one device is supported for now */
- if (idx->device_id) {
- erofs_err(sb, "invalid device id %u @ %llu for nid %llu",
- le16_to_cpu(idx->device_id),
- chunknr, vi->nid);
- err = -EFSCORRUPTED;
- goto out_unlock;
- }
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
map->m_flags = EROFS_MAP_MAPPED;
break;
}
out_unlock:
- unlock_page(page);
- put_page(page);
+ erofs_put_metabuf(&buf);
out:
- map->m_llen = map->m_plen;
+ if (!err)
+ map->m_llen = map->m_plen;
+ trace_erofs_map_blocks_exit(inode, map, flags, 0);
return err;
}
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
+{
+ struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
+ struct erofs_device_info *dif;
+ int id;
+
+ /*
+  * primary device by default
+  *
+  * Fills map->m_bdev, map->m_daxdev and map->m_dax_part_off for the
+  * device backing map->m_pa. The primary device's values set here are
+  * overridden below either by an explicit map->m_deviceid or, when
+  * extra devices exist, by the device whose mapped range contains
+  * map->m_pa. Returns 0, or -ENODEV if map->m_deviceid is not found.
+  */
+ map->m_bdev = sb->s_bdev;
+ map->m_daxdev = EROFS_SB(sb)->dax_dev;
+ map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
+
+ if (map->m_deviceid) {
+ down_read(&devs->rwsem);
+ dif = idr_find(&devs->tree, map->m_deviceid - 1); /* id 0 means primary, so tree keys are id - 1 */
+ if (!dif) {
+ up_read(&devs->rwsem);
+ return -ENODEV;
+ }
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ map->m_dax_part_off = dif->dax_part_off;
+ up_read(&devs->rwsem);
+ } else if (devs->extra_devices) {
+ down_read(&devs->rwsem);
+ idr_for_each_entry(&devs->tree, dif, id) {
+ erofs_off_t startoff, length;
+
+ if (!dif->mapped_blkaddr) /* devices with a zero mapped_blkaddr are not searched */
+ continue;
+ startoff = blknr_to_addr(dif->mapped_blkaddr);
+ length = blknr_to_addr(dif->blocks);
+
+ if (map->m_pa >= startoff &&
+ map->m_pa < startoff + length) {
+ map->m_pa -= startoff; /* make the address relative to the start of this device's range */
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ map->m_dax_part_off = dif->dax_part_off;
+ break;
+ }
+ }
+ up_read(&devs->rwsem);
+ }
+ return 0; /* no matching extra device leaves the primary-device defaults in place */
+}
+
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
int ret;
struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
map.m_la = offset;
map.m_llen = length;
@@ -168,9 +249,19 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret < 0)
return ret;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(inode->i_sb, &mdev);
+ if (ret)
+ return ret;
+
iomap->offset = map.m_la;
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = mdev.m_daxdev;
+ else
+ iomap->bdev = mdev.m_bdev;
iomap->length = map.m_llen;
iomap->flags = 0;
iomap->private = NULL;
@@ -184,19 +275,21 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
}
if (map.m_flags & EROFS_MAP_META) {
- struct page *ipage;
+ void *ptr;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ipage = erofs_get_meta_page(inode->i_sb,
- erofs_blknr(map.m_pa));
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
- iomap->inline_data = page_address(ipage) +
- erofs_blkoff(map.m_pa);
- iomap->private = ipage;
+ ptr = erofs_read_metabuf(&buf, inode->i_sb,
+ erofs_blknr(mdev.m_pa), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa);
+ iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_pa;
+ iomap->addr = mdev.m_pa;
+ if (flags & IOMAP_DAX)
+ iomap->addr += mdev.m_dax_part_off;
}
return 0;
}
@@ -204,12 +297,17 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
ssize_t written, unsigned int flags, struct iomap *iomap)
{
- struct page *ipage = iomap->private;
+ void *ptr = iomap->private;
+
+ if (ptr) {
+ struct erofs_buf buf = {
+ .page = kmap_to_page(ptr),
+ .base = ptr,
+ .kmap_type = EROFS_KMAP,
+ };
- if (ipage) {
DBG_BUGON(iomap->type != IOMAP_INLINE);
- unlock_page(ipage);
- put_page(ipage);
+ erofs_put_metabuf(&buf);
} else {
DBG_BUGON(iomap->type == IOMAP_INLINE);
}
@@ -287,7 +385,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!err)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
- NULL, 0);
+ NULL, 0, 0);
if (err < 0)
return err;
}
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index a5bc4b1b7813..3efa686c7644 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -16,15 +16,12 @@
#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
#endif
-struct z_erofs_decompressor {
- /*
- * if destpages have sparsed pages, fill them with bounce pages.
- * it also check whether destpages indicate continuous physical memory.
- */
- int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
- int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
- char *name;
+struct z_erofs_lz4_decompress_ctx {
+ struct z_erofs_decompress_req *rq;
+ /* # of encoded, decoded pages */
+ unsigned int inpages, outpages;
+ /* decoded block total length (used for in-place decompression) */
+ unsigned int oend;
};
int z_erofs_load_lz4_config(struct super_block *sb,
@@ -63,11 +60,14 @@ int z_erofs_load_lz4_config(struct super_block *sb,
return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
}
-static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+/*
+ * Fill all gaps with bounce pages if it's a sparse page list. Also check if
+ * all physical pages are consecutive, which can be seen for moderate CR.
+ */
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
+ struct page **pagepool)
{
- const unsigned int nr =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ struct z_erofs_decompress_req *rq = ctx->rq;
struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
BITS_PER_LONG)] = { 0 };
@@ -77,7 +77,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
unsigned int i, j, top;
top = 0;
- for (i = j = 0; i < nr; ++i, ++j) {
+ for (i = j = 0; i < ctx->outpages; ++i, ++j) {
struct page *const page = rq->out[i];
struct page *victim;
@@ -119,41 +119,36 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
return kaddr ? 1 : 0;
}
-static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
void *inpage, unsigned int *inputmargin, int *maptype,
- bool support_0padding)
+ bool may_inplace)
{
- unsigned int nrpages_in, nrpages_out;
- unsigned int ofull, oend, inputsize, total, i, j;
+ struct z_erofs_decompress_req *rq = ctx->rq;
+ unsigned int omargin, total, i, j;
struct page **in;
void *src, *tmp;
- inputsize = rq->inputsize;
- nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT;
- oend = rq->pageofs_out + rq->outputsize;
- ofull = PAGE_ALIGN(oend);
- nrpages_out = ofull >> PAGE_SHIFT;
-
if (rq->inplace_io) {
- if (rq->partial_decoding || !support_0padding ||
- ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize))
+ omargin = PAGE_ALIGN(ctx->oend) - ctx->oend;
+ if (rq->partial_decoding || !may_inplace ||
+ omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
goto docopy;
- for (i = 0; i < nrpages_in; ++i) {
+ for (i = 0; i < ctx->inpages; ++i) {
DBG_BUGON(rq->in[i] == NULL);
- for (j = 0; j < nrpages_out - nrpages_in + i; ++j)
+ for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j)
if (rq->out[j] == rq->in[i])
goto docopy;
}
}
- if (nrpages_in <= 1) {
+ if (ctx->inpages <= 1) {
*maptype = 0;
return inpage;
}
kunmap_atomic(inpage);
might_sleep();
- src = erofs_vm_map_ram(rq->in, nrpages_in);
+ src = erofs_vm_map_ram(rq->in, ctx->inpages);
if (!src)
return ERR_PTR(-ENOMEM);
*maptype = 1;
@@ -162,7 +157,7 @@ static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
docopy:
/* Or copy compressed data which can be overlapped to per-CPU buffer */
in = rq->in;
- src = erofs_get_pcpubuf(nrpages_in);
+ src = erofs_get_pcpubuf(ctx->inpages);
if (!src) {
DBG_BUGON(1);
kunmap_atomic(inpage);
@@ -189,35 +184,53 @@ docopy:
return src;
}
-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+/*
+ * Get the exact inputsize with zero_padding feature.
+ * - For LZ4, it should work if zero_padding feature is on (5.3+);
+ * - For MicroLZMA, it'd be enabled all the time.
+ *
+ * Scans the first @padbufsize bytes of @padbuf with memchr_inv() for the
+ * first byte that is not zero, then shrinks rq->inputsize and advances
+ * rq->pageofs_in by the number of bytes in front of it.
+ *
+ * Returns 0 on success, or -EFSCORRUPTED when memchr_inv() returns NULL
+ * (presumably: the whole window is zero, so no payload start was found).
+ */
+int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
+ unsigned int padbufsize)
+{
+ const char *padend;
+
+ padend = memchr_inv(padbuf, 0, padbufsize);
+ if (!padend)
+ return -EFSCORRUPTED;
+ rq->inputsize -= padend - padbuf; /* padend - padbuf == length of the leading padding */
+ rq->pageofs_in += padend - padbuf;
+ return 0;
+}
+
+static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
+ u8 *out)
{
+ struct z_erofs_decompress_req *rq = ctx->rq;
+ bool support_0padding = false, may_inplace = false;
unsigned int inputmargin;
u8 *headpage, *src;
- bool support_0padding;
int ret, maptype;
DBG_BUGON(*rq->in == NULL);
headpage = kmap_atomic(*rq->in);
- inputmargin = 0;
- support_0padding = false;
- /* decompression inplace is only safe when 0padding is enabled */
- if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) {
+ /* LZ4 decompression inplace is only safe if zero_padding is enabled */
+ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) {
support_0padding = true;
-
- while (!headpage[inputmargin & ~PAGE_MASK])
- if (!(++inputmargin & ~PAGE_MASK))
- break;
-
- if (inputmargin >= rq->inputsize) {
+ ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ EROFS_BLKSIZ - rq->pageofs_in));
+ if (ret) {
kunmap_atomic(headpage);
- return -EIO;
+ return ret;
}
+ may_inplace = !((rq->pageofs_in + rq->inputsize) &
+ (EROFS_BLKSIZ - 1));
}
- rq->inputsize -= inputmargin;
- src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
- support_0padding);
+ inputmargin = rq->pageofs_in;
+ src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin,
+ &maptype, may_inplace);
if (IS_ERR(src))
return PTR_ERR(src);
@@ -233,7 +246,6 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
ret, rq->inputsize, inputmargin, rq->outputsize);
- WARN_ON(1);
print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
16, 1, src + inputmargin, rq->inputsize, true);
print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
@@ -242,12 +254,14 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
if (ret >= 0)
memset(out + ret, 0, rq->outputsize - ret);
ret = -EIO;
+ } else {
+ ret = 0;
}
if (maptype == 0) {
- kunmap_atomic(src);
+ kunmap_atomic(headpage);
} else if (maptype == 1) {
- vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT);
+ vm_unmap_ram(src, ctx->inpages);
} else if (maptype == 2) {
erofs_put_pcpubuf(src);
} else {
@@ -257,115 +271,57 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
return ret;
}
-static struct z_erofs_decompressor decompressors[] = {
- [Z_EROFS_COMPRESSION_SHIFTED] = {
- .name = "shifted"
- },
- [Z_EROFS_COMPRESSION_LZ4] = {
- .prepare_destpages = z_erofs_lz4_prepare_destpages,
- .decompress = z_erofs_lz4_decompress,
- .name = "lz4"
- },
-};
-
-static void copy_from_pcpubuf(struct page **out, const char *dst,
- unsigned short pageofs_out,
- unsigned int outputsize)
-{
- const char *end = dst + outputsize;
- const unsigned int righthalf = PAGE_SIZE - pageofs_out;
- const char *cur = dst - pageofs_out;
-
- while (cur < end) {
- struct page *const page = *out++;
-
- if (page) {
- char *buf = kmap_atomic(page);
-
- if (cur >= dst) {
- memcpy(buf, cur, min_t(uint, PAGE_SIZE,
- end - cur));
- } else {
- memcpy(buf + pageofs_out, cur + pageofs_out,
- min_t(uint, righthalf, end - cur));
- }
- kunmap_atomic(buf);
- }
- cur += PAGE_SIZE;
- }
-}
-
-static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const struct z_erofs_decompressor *alg = decompressors + rq->alg;
+ struct z_erofs_lz4_decompress_ctx ctx;
unsigned int dst_maptype;
void *dst;
int ret;
- /* two optimized fast paths only for non bigpcluster cases yet */
- if (rq->inputsize <= PAGE_SIZE) {
- if (nrpages_out == 1 && !rq->inplace_io) {
- DBG_BUGON(!*rq->out);
- dst = kmap_atomic(*rq->out);
- dst_maptype = 0;
- goto dstmap_out;
- }
+ ctx.rq = rq;
+ ctx.oend = rq->pageofs_out + rq->outputsize;
+ ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT;
+ ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- /*
- * For the case of small output size (especially much less
- * than PAGE_SIZE), memcpy the decompressed data rather than
- * compressed data is preferred.
- */
- if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
- dst = erofs_get_pcpubuf(1);
- if (IS_ERR(dst))
- return PTR_ERR(dst);
-
- rq->inplace_io = false;
- ret = alg->decompress(rq, dst);
- if (!ret)
- copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
- rq->outputsize);
-
- erofs_put_pcpubuf(dst);
- return ret;
- }
+ /* one optimized fast path only for non bigpcluster cases yet */
+ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
+ DBG_BUGON(!*rq->out);
+ dst = kmap_atomic(*rq->out);
+ dst_maptype = 0;
+ goto dstmap_out;
}
/* general decoding path which can be used for all cases */
- ret = alg->prepare_destpages(rq, pagepool);
- if (ret < 0)
+ ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool);
+ if (ret < 0) {
return ret;
- if (ret) {
+ } else if (ret > 0) {
dst = page_address(*rq->out);
dst_maptype = 1;
- goto dstmap_out;
+ } else {
+ dst = erofs_vm_map_ram(rq->out, ctx.outpages);
+ if (!dst)
+ return -ENOMEM;
+ dst_maptype = 2;
}
- dst = erofs_vm_map_ram(rq->out, nrpages_out);
- if (!dst)
- return -ENOMEM;
- dst_maptype = 2;
-
dstmap_out:
- ret = alg->decompress(rq, dst + rq->pageofs_out);
-
+ ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out);
if (!dst_maptype)
kunmap_atomic(dst);
else if (dst_maptype == 2)
- vm_unmap_ram(dst, nrpages_out);
+ vm_unmap_ram(dst, ctx.outpages);
return ret;
}
-static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out;
+ const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
+ PAGE_SIZE - rq->pageofs_out);
unsigned char *src, *dst;
if (nrpages_out > 2) {
@@ -378,7 +334,7 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
return 0;
}
- src = kmap_atomic(*rq->in);
+ src = kmap_atomic(*rq->in) + rq->pageofs_in;
if (rq->out[0]) {
dst = kmap_atomic(rq->out[0]);
memcpy(dst + rq->pageofs_out, src, righthalf);
@@ -399,10 +355,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
return 0;
}
+/*
+ * Dispatch table indexed by the algorithm id of a request (rq->alg): each
+ * slot provides the decompress callback and a short name.  The LZMA slot is
+ * only present when CONFIG_EROFS_FS_ZIP_LZMA is enabled.
+ */
+static struct z_erofs_decompressor decompressors[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = {
+ .decompress = z_erofs_shifted_transform,
+ .name = "shifted"
+ },
+ [Z_EROFS_COMPRESSION_LZ4] = {
+ .decompress = z_erofs_lz4_decompress,
+ .name = "lz4"
+ },
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+ [Z_EROFS_COMPRESSION_LZMA] = {
+ .decompress = z_erofs_lzma_decompress,
+ .name = "lzma"
+ },
+#endif
+};
+
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+ struct page **pagepool)
{
- if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
- return z_erofs_shifted_transform(rq, pagepool);
- return z_erofs_decompress_generic(rq, pagepool);
+ return decompressors[rq->alg].decompress(rq, pagepool);
}
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
new file mode 100644
index 000000000000..05a3063cf2bc
--- /dev/null
+++ b/fs/erofs/decompressor_lzma.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/xz.h>
+#include <linux/module.h>
+#include "compress.h"
+
+/* one pooled MicroLZMA decoder stream, chained on a singly-linked list */
+struct z_erofs_lzma {
+ struct z_erofs_lzma *next; /* next stream on the list */
+ struct xz_dec_microlzma *state; /* from xz_dec_microlzma_alloc() */
+ struct xz_buf buf; /* in/out descriptor handed to xz_dec_microlzma_run() */
+ u8 bounce[PAGE_SIZE]; /* input copy used when input and output share a page */
+};
+
+/* considering the LZMA performance, no need to use a lockless list for now */
+static DEFINE_SPINLOCK(z_erofs_lzma_lock);
+static unsigned int z_erofs_lzma_max_dictsize;
+static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms;
+static struct z_erofs_lzma *z_erofs_lzma_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
+
+module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
+
+/*
+ * Tear down the global LZMA stream pool: repeatedly detach the whole list
+ * under the lock and free each stream (and its decoder state, if any) until
+ * the count of available streams drops to zero.
+ */
+void z_erofs_lzma_exit(void)
+{
+	struct z_erofs_lzma *cur, *following;
+
+	/* there should be no running fs instance */
+	while (z_erofs_lzma_avail_strms) {
+		/* take over everything that is currently on the list */
+		spin_lock(&z_erofs_lzma_lock);
+		cur = z_erofs_lzma_head;
+		z_erofs_lzma_head = NULL;
+		spin_unlock(&z_erofs_lzma_lock);
+
+		/* streams are still accounted for, yet none is on the list */
+		if (!cur) {
+			DBG_BUGON(1);
+			return;
+		}
+
+		for (; cur; cur = following) {
+			following = cur->next;
+			if (cur->state)
+				xz_dec_microlzma_end(cur->state);
+			kfree(cur);
+			--z_erofs_lzma_avail_strms;
+		}
+	}
+}
+
+/*
+ * Fill the global stream pool with z_erofs_lzma_nstrms zeroed entries (the
+ * number of possible CPUs when the module parameter is left at 0).  No
+ * decoder state is allocated here; entries start with a NULL ->state.
+ *
+ * Returns 0 on success, or -ENOMEM after releasing what was allocated.
+ */
+int z_erofs_lzma_init(void)
+{
+	unsigned int nr = 0;
+
+	/* by default, use # of possible CPUs instead */
+	if (z_erofs_lzma_nstrms == 0)
+		z_erofs_lzma_nstrms = num_possible_cpus();
+
+	while (nr < z_erofs_lzma_nstrms) {
+		struct z_erofs_lzma *strm;
+
+		strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+		if (strm == NULL) {
+			z_erofs_lzma_exit();
+			return -ENOMEM;
+		}
+
+		/* push the new entry onto the head of the list */
+		spin_lock(&z_erofs_lzma_lock);
+		strm->next = z_erofs_lzma_head;
+		z_erofs_lzma_head = strm;
+		spin_unlock(&z_erofs_lzma_lock);
+
+		++z_erofs_lzma_avail_strms;
+		++nr;
+	}
+	return 0;
+}
+
+/*
+ * Validate an on-disk LZMA configuration and, when its dictionary is larger
+ * than what the pooled streams are currently sized for, reallocate the
+ * decoder state of every stream for the new dictionary size.
+ *
+ * @sb:   superblock, used for log messages
+ * @dsb:  on-disk superblock (not referenced by this function)
+ * @lzma: LZMA configuration to check
+ * @size: size of the configuration that @lzma points to
+ *
+ * Returns 0 on success, -EINVAL for an invalid or unsupported configuration,
+ * or -ENOMEM if a decoder state could not be allocated.
+ */
+int z_erofs_load_lzma_config(struct super_block *sb,
+			     struct erofs_super_block *dsb,
+			     struct z_erofs_lzma_cfgs *lzma, int size)
+{
+	static DEFINE_MUTEX(lzma_resize_mutex);
+	unsigned int dict_size, i;
+	struct z_erofs_lzma *strm, *head = NULL;
+	int err;
+
+	if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) {
+		erofs_err(sb, "invalid lzma cfgs, size=%u", size);
+		return -EINVAL;
+	}
+	if (lzma->format) {
+		erofs_err(sb, "unidentified lzma format %x, please check kernel version",
+			  le16_to_cpu(lzma->format));
+		return -EINVAL;
+	}
+	dict_size = le32_to_cpu(lzma->dict_size);
+	if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) {
+		erofs_err(sb, "unsupported lzma dictionary size %u",
+			  dict_size);
+		return -EINVAL;
+	}
+
+	erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
+
+	/*
+	 * Serialize resizers: two callers isolating streams at the same time
+	 * could each hold part of the pool and wait for the other forever.
+	 */
+	mutex_lock(&lzma_resize_mutex);
+
+	/* streams are already large enough for this dictionary */
+	if (z_erofs_lzma_max_dictsize >= dict_size) {
+		mutex_unlock(&lzma_resize_mutex);
+		return 0;
+	}
+
+	/* 1. collect/isolate all streams for the following check */
+	for (i = 0; i < z_erofs_lzma_avail_strms; ++i) {
+		struct z_erofs_lzma *last;
+
+again:
+		spin_lock(&z_erofs_lzma_lock);
+		strm = z_erofs_lzma_head;
+		if (!strm) {
+			/* some streams are in use; wait until one comes back */
+			spin_unlock(&z_erofs_lzma_lock);
+			wait_event(z_erofs_lzma_wq,
+				   READ_ONCE(z_erofs_lzma_head));
+			goto again;
+		}
+		z_erofs_lzma_head = NULL;
+		spin_unlock(&z_erofs_lzma_lock);
+
+		/* the whole chain was taken: account for each extra stream */
+		for (last = strm; last->next; last = last->next)
+			++i;
+		last->next = head;
+		head = strm;
+	}
+
+	err = 0;
+	/* 2. walk each isolated stream and grow max dict_size if needed */
+	for (strm = head; strm; strm = strm->next) {
+		if (strm->state)
+			xz_dec_microlzma_end(strm->state);
+		strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
+		if (!strm->state)
+			err = -ENOMEM;
+	}
+
+	/* 3. push back all to the global list and update max dict_size */
+	spin_lock(&z_erofs_lzma_lock);
+	DBG_BUGON(z_erofs_lzma_head);
+	z_erofs_lzma_head = head;
+	spin_unlock(&z_erofs_lzma_lock);
+	/*
+	 * Every stream was isolated above, so no decompressor is left to
+	 * issue a wake_up() when returning one.  Wake all tasks that went
+	 * to sleep on the empty list, or they may never run again.
+	 */
+	wake_up_all(&z_erofs_lzma_wq);
+
+	z_erofs_lzma_max_dictsize = dict_size;
+	mutex_unlock(&lzma_resize_mutex);
+	return err;
+}
+
+/*
+ * Decompress the MicroLZMA data described by @rq page by page.
+ *
+ * A stream is borrowed from the global list (sleeping until one is
+ * available), xz_dec_microlzma_run() is called repeatedly while input and
+ * output pages are mapped one at a time, and the stream is handed back at
+ * the end.
+ *
+ * @rq:       decompression request (input/output page arrays, sizes, offsets)
+ * @pagepool: pool passed to erofs_allocpage() for short-lived pages that
+ *            replace input pages overlapping the current output page
+ *
+ * Returns 0 on success, the error from z_erofs_fixup_insize(), or
+ * -EFSCORRUPTED if the data cannot be decoded within the given bounds.
+ */
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+			    struct page **pagepool)
+{
+	const unsigned int nrpages_out =
+		PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+	const unsigned int nrpages_in =
+		PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+	unsigned int inlen, outlen, pageofs;
+	struct z_erofs_lzma *strm;
+	u8 *kin;
+	bool bounced = false;
+	int no, ni, j, err = 0;
+
+	/* 1. get the exact LZMA compressed size */
+	kin = kmap(*rq->in);
+	err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
+				   min_t(unsigned int, rq->inputsize,
+					 EROFS_BLKSIZ - rq->pageofs_in));
+	if (err) {
+		kunmap(*rq->in);
+		return err;
+	}
+
+	/* 2. get an available lzma context */
+again:
+	spin_lock(&z_erofs_lzma_lock);
+	strm = z_erofs_lzma_head;
+	if (!strm) {
+		spin_unlock(&z_erofs_lzma_lock);
+		wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head));
+		goto again;
+	}
+	z_erofs_lzma_head = strm->next;
+	spin_unlock(&z_erofs_lzma_lock);
+
+	/* 3. multi-call decompress */
+	inlen = rq->inputsize;
+	outlen = rq->outputsize;
+	xz_dec_microlzma_reset(strm->state, inlen, outlen,
+			       !rq->partial_decoding);
+	pageofs = rq->pageofs_out;
+	strm->buf.in = kin + rq->pageofs_in;
+	strm->buf.in_pos = 0;
+	strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in);
+	inlen -= strm->buf.in_size;
+	strm->buf.out = NULL;
+	strm->buf.out_pos = 0;
+	strm->buf.out_size = 0;
+
+	/* no starts at -1: the first iteration advances to output page 0 */
+	for (ni = 0, no = -1;;) {
+		enum xz_ret xz_err;
+
+		if (strm->buf.out_pos == strm->buf.out_size) {
+			/* current output page is full: move to the next one */
+			if (strm->buf.out) {
+				kunmap(rq->out[no]);
+				strm->buf.out = NULL;
+			}
+
+			if (++no >= nrpages_out || !outlen) {
+				erofs_err(rq->sb, "decompressed buf out of bound");
+				err = -EFSCORRUPTED;
+				break;
+			}
+			strm->buf.out_pos = 0;
+			strm->buf.out_size = min_t(u32, outlen,
+						   PAGE_SIZE - pageofs);
+			outlen -= strm->buf.out_size;
+			/* a NULL output page keeps ->out NULL (nothing mapped) */
+			if (rq->out[no])
+				strm->buf.out = kmap(rq->out[no]) + pageofs;
+			pageofs = 0;
+		} else if (strm->buf.in_pos == strm->buf.in_size) {
+			/* current input page is consumed: move to the next one */
+			kunmap(rq->in[ni]);
+
+			if (++ni >= nrpages_in || !inlen) {
+				erofs_err(rq->sb, "compressed buf out of bound");
+				err = -EFSCORRUPTED;
+				break;
+			}
+			strm->buf.in_pos = 0;
+			strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
+			inlen -= strm->buf.in_size;
+			kin = kmap(rq->in[ni]);
+			strm->buf.in = kin;
+			bounced = false;
+		}
+
+		/*
+		 * Handle overlapping: Use bounced buffer if the compressed
+		 * data is under processing; Otherwise, Use short-lived pages
+		 * from the on-stack pagepool where pages share with the same
+		 * request.
+		 */
+		if (!bounced && rq->out[no] == rq->in[ni]) {
+			memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
+			strm->buf.in = strm->bounce;
+			bounced = true;
+		}
+		for (j = ni + 1; j < nrpages_in; ++j) {
+			struct page *tmppage;
+
+			if (rq->out[no] != rq->in[j])
+				continue;
+
+			DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
+							rq->in[j]));
+			tmppage = erofs_allocpage(pagepool,
+						  GFP_KERNEL | __GFP_NOFAIL);
+			set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+			copy_highpage(tmppage, rq->in[j]);
+			rq->in[j] = tmppage;
+		}
+		xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
+		DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
+		DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+
+		if (xz_err != XZ_OK) {
+			/* the only clean exit: stream ended with all output produced */
+			if (xz_err == XZ_STREAM_END && !outlen)
+				break;
+			erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+				  xz_err, rq->inputsize, rq->outputsize);
+			err = -EFSCORRUPTED;
+			break;
+		}
+	}
+	/*
+	 * Drop the mappings that are still held.  The output mapping came
+	 * from kmap(rq->out[no]), so that is the page that must be unmapped.
+	 */
+	if (no < nrpages_out && strm->buf.out)
+		kunmap(rq->out[no]);
+	if (ni < nrpages_in)
+		kunmap(rq->in[ni]);
+	/* 4. push back LZMA stream context to the global list */
+	spin_lock(&z_erofs_lzma_lock);
+	strm->next = z_erofs_lzma_head;
+	z_erofs_lzma_head = strm;
+	spin_unlock(&z_erofs_lzma_lock);
+	wake_up(&z_erofs_lzma_wq);
+	return err;
+}
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index eee9b0b31b63..18e59821c597 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "internal.h"
@@ -67,7 +68,7 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
static int erofs_readdir(struct file *f, struct dir_context *ctx)
{
struct inode *dir = file_inode(f);
- struct address_space *mapping = dir->i_mapping;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
const size_t dirsize = i_size_read(dir);
unsigned int i = ctx->pos / EROFS_BLKSIZ;
unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
@@ -75,26 +76,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
bool initial = true;
while (ctx->pos < dirsize) {
- struct page *dentry_page;
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- dentry_page = read_mapping_page(mapping, i, NULL);
- if (dentry_page == ERR_PTR(-ENOMEM)) {
- err = -ENOMEM;
- break;
- } else if (IS_ERR(dentry_page)) {
+ de = erofs_bread(&buf, dir, i, EROFS_KMAP);
+ if (IS_ERR(de)) {
erofs_err(dir->i_sb,
"fail to readdir of logical block %u of nid %llu",
i, EROFS_I(dir)->nid);
- err = -EFSCORRUPTED;
+ err = PTR_ERR(de);
break;
}
- de = (struct erofs_dirent *)kmap(dentry_page);
-
nameoff = le16_to_cpu(de->nameoff);
-
if (nameoff < sizeof(struct erofs_dirent) ||
nameoff >= PAGE_SIZE) {
erofs_err(dir->i_sb,
@@ -119,10 +113,6 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
err = erofs_fill_dentries(dir, ctx, de, &ofs,
nameoff, maxsize);
skip_this:
- kunmap(dentry_page);
-
- put_page(dentry_page);
-
ctx->pos = blknr_to_addr(i) + ofs;
if (err)
@@ -130,6 +120,7 @@ skip_this:
++i;
ofs = 0;
}
+ erofs_put_metabuf(&buf);
return err < 0 ? err : 0;
}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index b0b23f41abc3..1238ca104f09 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -12,23 +12,41 @@
#define EROFS_SUPER_OFFSET 1024
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
* be incompatible with this kernel version.
*/
-#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001
+#define EROFS_FEATURE_INCOMPAT_ZERO_PADDING 0x00000001
#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
+#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
+#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008
+#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010
#define EROFS_ALL_FEATURE_INCOMPAT \
- (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
+ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
- EROFS_FEATURE_INCOMPAT_CHUNKED_FILE)
+ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
+ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
+ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
+ EROFS_FEATURE_INCOMPAT_ZTAILPACKING)
#define EROFS_SB_EXTSLOT_SIZE 16
+/*
+ * On-disk slot describing one device; its size is pinned to 128 bytes by a
+ * BUILD_BUG_ON() in erofs_check_ondisk_layout_definitions().
+ */
+struct erofs_deviceslot {
+ union {
+ u8 uuid[16]; /* used for device manager later */
+ u8 userdata[64]; /* digest(sha256), etc. */
+ } u;
+ __le32 blocks; /* total fs blocks of this device */
+ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
+ u8 reserved[56];
+};
+/* size of one device slot, in bytes */
+#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
+
/* erofs on-disk super block (currently 128 bytes) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
@@ -54,7 +72,9 @@ struct erofs_super_block {
/* customized sliding window size instead of 64k by default */
__le16 lz4_max_distance;
} __packed u1;
- __u8 reserved2[42];
+ __le16 extra_devices; /* # of devices besides the primary device */
+ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
+ __u8 reserved2[38];
};
/*
@@ -167,8 +187,8 @@ struct erofs_inode_extended {
__le32 i_uid;
__le32 i_gid;
- __le64 i_ctime;
- __le32 i_ctime_nsec;
+ __le64 i_mtime;
+ __le32 i_mtime_nsec;
__le32 i_nlink;
__u8 i_reserved2[16];
};
@@ -192,7 +212,7 @@ struct erofs_xattr_ibody_header {
__le32 h_reserved;
__u8 h_shared_count;
__u8 h_reserved2[7];
- __le32 h_shared_xattrs[0]; /* shared xattr id array */
+ __le32 h_shared_xattrs[]; /* shared xattr id array */
};
/* Name indexes */
@@ -209,7 +229,7 @@ struct erofs_xattr_entry {
__u8 e_name_index; /* attribute name index */
__le16 e_value_size; /* size of attribute value */
/* followed by e_name and e_value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
@@ -238,7 +258,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 8-byte inode chunk indexes */
struct erofs_inode_chunk_index {
__le16 advise; /* always 0, don't care for now */
- __le16 device_id; /* back-end storage id, always 0 for now */
+ __le16 device_id; /* back-end storage id (with bits masked) */
__le32 blkaddr; /* start block address of this inode chunk */
};
@@ -247,10 +267,11 @@ struct erofs_inode_chunk_index {
/* available compression algorithm types (for h_algorithmtype) */
enum {
- Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZMA = 1,
Z_EROFS_COMPRESSION_MAX
};
-#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1))
+#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
/* 14 bytes (+ length field = 16 bytes) */
struct z_erofs_lz4_cfgs {
@@ -259,19 +280,32 @@ struct z_erofs_lz4_cfgs {
u8 reserved[10];
} __packed;
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lzma_cfgs {
+ /* accepted range when loaded: 4096 .. Z_EROFS_LZMA_MAX_DICT_SIZE */
+ __le32 dict_size;
+ /* must be 0; any other value is rejected when the config is loaded */
+ __le16 format;
+ u8 reserved[8];
+} __packed;
+
+#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
* (4B) + 2B + (4B) if compacted 2B is on.
* bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
* bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
+ * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
*/
#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
+#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
struct z_erofs_map_header {
- __le32 h_reserved1;
+ __le16 h_reserved1;
+ /* indicates the encoded size of tailpacking data */
+ __le16 h_idata_size;
__le16 h_advise;
/*
* bit 0-3 : algorithm type of head 1 (logical cluster type 01);
@@ -288,35 +322,34 @@ struct z_erofs_map_header {
#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
/*
- * Fixed-sized output compression ondisk Logical Extent cluster type:
- * 0 - literal (uncompressed) cluster
- * 1 - compressed cluster (for the head logical cluster)
- * 2 - compressed cluster (for the other logical clusters)
+ * Fixed-sized output compression on-disk logical cluster type:
+ * 0 - literal (uncompressed) lcluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
*
* In detail,
- * 0 - literal (uncompressed) cluster,
+ * 0 - literal (uncompressed) lcluster,
* di_advise = 0
- * di_clusterofs = the literal data offset of the cluster
- * di_blkaddr = the blkaddr of the literal cluster
+ * di_clusterofs = the literal data offset of the lcluster
+ * di_blkaddr = the blkaddr of the literal pcluster
*
- * 1 - compressed cluster (for the head logical cluster)
- * di_advise = 1
- * di_clusterofs = the decompressed data offset of the cluster
- * di_blkaddr = the blkaddr of the compressed cluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * di_advise = 1 or 3
+ * di_clusterofs = the decompressed data offset of the lcluster
+ * di_blkaddr = the blkaddr of the compressed pcluster
*
- * 2 - compressed cluster (for the other logical clusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
* di_advise = 2
* di_clusterofs =
- * the decompressed data offset in its own head cluster
- * di_u.delta[0] = distance to its corresponding head cluster
- * di_u.delta[1] = distance to its corresponding tail cluster
- * (di_advise could be 0, 1 or 2)
+ * the decompressed data offset in its own HEAD lcluster
+ * di_u.delta[0] = distance to this HEAD lcluster
+ * di_u.delta[1] = distance to the next HEAD lcluster
*/
enum {
Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0,
- Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1,
Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2,
- Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3,
Z_EROFS_VLE_CLUSTER_TYPE_MAX
};
@@ -384,6 +417,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
/* keep in sync between 2 index structures for better extendibility */
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
sizeof(struct z_erofs_vle_decompressed_index));
+ BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index a552399e211d..e8b37ba5e9ad 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -13,8 +13,8 @@
* the inode payload page if it's an extended inode) in order to fill
* inline data if possible.
*/
-static struct page *erofs_read_inode(struct inode *inode,
- unsigned int *ofs)
+static void *erofs_read_inode(struct erofs_buf *buf,
+ struct inode *inode, unsigned int *ofs)
{
struct super_block *sb = inode->i_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -22,7 +22,7 @@ static struct page *erofs_read_inode(struct inode *inode,
const erofs_off_t inode_loc = iloc(sbi, vi->nid);
erofs_blk_t blkaddr, nblks = 0;
- struct page *page;
+ void *kaddr;
struct erofs_inode_compact *dic;
struct erofs_inode_extended *die, *copied = NULL;
unsigned int ifmt;
@@ -34,14 +34,14 @@ static struct page *erofs_read_inode(struct inode *inode,
erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u",
__func__, vi->nid, *ofs, blkaddr);
- page = erofs_get_meta_page(sb, blkaddr);
- if (IS_ERR(page)) {
+ kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
- vi->nid, PTR_ERR(page));
- return page;
+ vi->nid, PTR_ERR(kaddr));
+ return kaddr;
}
- dic = page_address(page) + *ofs;
+ dic = kaddr + *ofs;
ifmt = le16_to_cpu(dic->i_format);
if (ifmt & ~EROFS_I_ALL) {
@@ -62,12 +62,12 @@ static struct page *erofs_read_inode(struct inode *inode,
switch (erofs_inode_version(ifmt)) {
case EROFS_INODE_LAYOUT_EXTENDED:
vi->inode_isize = sizeof(struct erofs_inode_extended);
- /* check if the inode acrosses page boundary */
- if (*ofs + vi->inode_isize <= PAGE_SIZE) {
+ /* check if the extended inode crosses block boundary */
+ if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) {
*ofs += vi->inode_isize;
die = (struct erofs_inode_extended *)dic;
} else {
- const unsigned int gotten = PAGE_SIZE - *ofs;
+ const unsigned int gotten = EROFS_BLKSIZ - *ofs;
copied = kmalloc(vi->inode_isize, GFP_NOFS);
if (!copied) {
@@ -75,18 +75,16 @@ static struct page *erofs_read_inode(struct inode *inode,
goto err_out;
}
memcpy(copied, dic, gotten);
- unlock_page(page);
- put_page(page);
-
- page = erofs_get_meta_page(sb, blkaddr + 1);
- if (IS_ERR(page)) {
- erofs_err(sb, "failed to get inode payload page (nid: %llu), err %ld",
- vi->nid, PTR_ERR(page));
+ kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1,
+ EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
+ vi->nid, PTR_ERR(kaddr));
kfree(copied);
- return page;
+ return kaddr;
}
*ofs = vi->inode_isize - gotten;
- memcpy((u8 *)copied + gotten, page_address(page), *ofs);
+ memcpy((u8 *)copied + gotten, kaddr, *ofs);
die = copied;
}
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
@@ -115,8 +113,8 @@ static struct page *erofs_read_inode(struct inode *inode,
set_nlink(inode, le32_to_cpu(die->i_nlink));
/* extended inode has its own timestamp */
- inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec);
+ inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime);
+ inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec);
inode->i_size = le64_to_cpu(die->i_size);
@@ -192,7 +190,7 @@ static struct page *erofs_read_inode(struct inode *inode,
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
inode->i_flags &= ~S_DAX;
- if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
vi->datalayout == EROFS_INODE_FLAT_PLAIN)
inode->i_flags |= S_DAX;
if (!nblks)
@@ -200,7 +198,7 @@ static struct page *erofs_read_inode(struct inode *inode,
inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
else
inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK;
- return page;
+ return kaddr;
bogusimode:
erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
@@ -209,12 +207,11 @@ bogusimode:
err_out:
DBG_BUGON(1);
kfree(copied);
- unlock_page(page);
- put_page(page);
+ erofs_put_metabuf(buf);
return ERR_PTR(err);
}
-static int erofs_fill_symlink(struct inode *inode, void *data,
+static int erofs_fill_symlink(struct inode *inode, void *kaddr,
unsigned int m_pofs)
{
struct erofs_inode *vi = EROFS_I(inode);
@@ -222,7 +219,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
/* if it cannot be handled with fast symlink scheme */
if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
- inode->i_size >= PAGE_SIZE) {
+ inode->i_size >= EROFS_BLKSIZ) {
inode->i_op = &erofs_symlink_iops;
return 0;
}
@@ -232,8 +229,8 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
return -ENOMEM;
m_pofs += vi->xattr_isize;
- /* inline symlink data shouldn't cross page boundary as well */
- if (m_pofs + inode->i_size > PAGE_SIZE) {
+ /* inline symlink data shouldn't cross block boundary */
+ if (m_pofs + inode->i_size > EROFS_BLKSIZ) {
kfree(lnk);
erofs_err(inode->i_sb,
"inline data cross block boundary @ nid %llu",
@@ -241,8 +238,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
-
- memcpy(lnk, data + m_pofs, inode->i_size);
+ memcpy(lnk, kaddr + m_pofs, inode->i_size);
lnk[inode->i_size] = '\0';
inode->i_link = lnk;
@@ -253,16 +249,17 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
static int erofs_fill_inode(struct inode *inode, int isdir)
{
struct erofs_inode *vi = EROFS_I(inode);
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ void *kaddr;
unsigned int ofs;
int err = 0;
trace_erofs_fill_inode(inode, isdir);
/* read inode base data from disk */
- page = erofs_read_inode(inode, &ofs);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ kaddr = erofs_read_inode(&buf, inode, &ofs);
+ if (IS_ERR(kaddr))
+ return PTR_ERR(kaddr);
/* setup the new inode */
switch (inode->i_mode & S_IFMT) {
@@ -278,7 +275,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
inode->i_fop = &erofs_dir_fops;
break;
case S_IFLNK:
- err = erofs_fill_symlink(inode, page_address(page), ofs);
+ err = erofs_fill_symlink(inode, kaddr, ofs);
if (err)
goto out_unlock;
inode_nohighmem(inode);
@@ -302,8 +299,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
inode->i_mapping->a_ops = &erofs_raw_access_aops;
out_unlock:
- unlock_page(page);
- put_page(page);
+ erofs_put_metabuf(&buf);
return err;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 9524e155b38f..5298c4ee277d 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -47,12 +47,28 @@ typedef u64 erofs_off_t;
/* data type for filesystem-wide blocks number */
typedef u32 erofs_blk_t;
-struct erofs_fs_context {
+struct erofs_device_info {
+ char *path;
+ struct block_device *bdev;
+ struct dax_device *dax_dev;
+ u64 dax_part_off;
+
+ u32 blocks;
+ u32 mapped_blkaddr;
+};
+
+enum {
+ EROFS_SYNC_DECOMPRESS_AUTO,
+ EROFS_SYNC_DECOMPRESS_FORCE_ON,
+ EROFS_SYNC_DECOMPRESS_FORCE_OFF
+};
+
+struct erofs_mount_opts {
#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
- /* strategy of sync decompression (false - auto, true - force on) */
- bool readahead_sync_decompress;
+ /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */
+ unsigned int sync_decompress;
/* threshold for decompression synchronously */
unsigned int max_sync_decompress_pages;
@@ -60,6 +76,18 @@ struct erofs_fs_context {
unsigned int mount_opt;
};
+struct erofs_dev_context {
+ struct idr tree;
+ struct rw_semaphore rwsem;
+
+ unsigned int extra_devices;
+};
+
+struct erofs_fs_context {
+ struct erofs_mount_opts opt;
+ struct erofs_dev_context *devs;
+};
+
/* all filesystem-wide lz4 configurations */
struct erofs_sb_lz4_info {
/* # of pages needed for EROFS lz4 rolling decompression */
@@ -69,6 +97,7 @@ struct erofs_sb_lz4_info {
};
struct erofs_sb_info {
+ struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
struct list_head list;
@@ -85,12 +114,17 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
+ struct erofs_dev_context *devs;
struct dax_device *dax_dev;
- u32 blocks;
+ u64 dax_part_off;
+ u64 total_blocks;
+ u32 primarydevice_blocks;
+
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
u32 xattr_blkaddr;
#endif
+ u16 device_id_mask; /* valid bits of device id to be used */
/* inode slot unit size in bit shift */
unsigned char islotbits;
@@ -109,7 +143,9 @@ struct erofs_sb_info {
u32 feature_compat;
u32 feature_incompat;
- struct erofs_fs_context ctx; /* options */
+ /* sysfs support */
+ struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
+ struct completion s_kobj_unregister;
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -121,9 +157,9 @@ struct erofs_sb_info {
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
-#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
-#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option)
-#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option)
+#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
enum {
EROFS_ZIP_CACHE_DISABLED,
@@ -217,6 +253,19 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
#error erofs cannot be used in this platform
#endif
+enum erofs_kmap_type {
+ EROFS_NO_KMAP, /* don't map the buffer */
+ EROFS_KMAP, /* use kmap() to map the buffer */
+ EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */
+};
+
+struct erofs_buf {
+ struct page *page;
+ void *base;
+ enum erofs_kmap_type kmap_type;
+};
+#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
+
#define ROOT_NID(sb) ((sb)->root_nid)
#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ)
@@ -234,9 +283,13 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
return sbi->feature_##compat & EROFS_FEATURE_##feature; \
}
-EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
+EROFS_FEATURE_FUNCS(zero_padding, incompat, INCOMPAT_ZERO_PADDING)
EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE)
+EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
+EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2)
+EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
/* atomic flag definitions */
@@ -271,6 +324,9 @@ struct erofs_inode {
unsigned short z_advise;
unsigned char z_algorithmtype[2];
unsigned char z_logical_clusterbits;
+ unsigned long z_tailextent_headlcn;
+ erofs_off_t z_idataoff;
+ unsigned short z_idata_size;
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -307,6 +363,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
EROFS_I_DATALAYOUT_BITS);
}
+/*
+ * Variant of grab_cache_page_nowait() whose allocation mask has
+ * __GFP_RECLAIM cleared, so that allocating a new page never triggers
+ * reclaiming.
+ */
+static inline
+struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
+					  pgoff_t index)
+{
+	const gfp_t gfp = readahead_gfp_mask(mapping) & ~__GFP_RECLAIM;
+	const int fgp = FGP_LOCK | FGP_CREAT | FGP_NOFS | FGP_NOWAIT;
+
+	return pagecache_get_page(mapping, index, fgp, gfp);
+}
+
extern const struct super_operations erofs_sops;
extern const struct address_space_operations erofs_raw_access_aops;
@@ -338,7 +407,7 @@ extern const struct address_space_operations z_erofs_aops;
* of the corresponding uncompressed data in the file.
*/
enum {
- BH_Zipped = BH_PrivateStart,
+ BH_Encoded = BH_PrivateStart,
BH_FullMapped,
};
@@ -346,18 +415,20 @@ enum {
#define EROFS_MAP_MAPPED (1 << BH_Mapped)
/* Located in metadata (could be copied from bd_inode) */
#define EROFS_MAP_META (1 << BH_Meta)
-/* The extent has been compressed */
-#define EROFS_MAP_ZIPPED (1 << BH_Zipped)
+/* The extent is encoded */
+#define EROFS_MAP_ENCODED (1 << BH_Encoded)
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
struct erofs_map_blocks {
+ struct erofs_buf buf;
+
erofs_off_t m_pa, m_la;
u64 m_plen, m_llen;
+ unsigned short m_deviceid;
+ char m_algorithmformat;
unsigned int m_flags;
-
- struct page *mpage;
};
/* Flags used by erofs_map_blocks_flatmode() */
@@ -367,6 +438,15 @@ struct erofs_map_blocks {
* approach instead if possible since it's more metadata lightweight.)
*/
#define EROFS_GET_BLOCKS_FIEMAP 0x0002
+/* Used to map the whole extent if non-negligible data is requested for LZMA */
+#define EROFS_GET_BLOCKS_READMORE 0x0004
+/* Used to map tail extent for tailpacking inline pcluster */
+#define EROFS_GET_BLOCKS_FINDTAIL 0x0008
+
+/*
+ * Algorithm ids that are numbered right after the on-disk ones
+ * (Z_EROFS_COMPRESSION_MAX); SHIFTED selects the "shifted" decompressor.
+ */
+enum {
+ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
/* zmap.c */
extern const struct iomap_ops z_erofs_iomap_report_ops;
@@ -386,9 +466,24 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+struct erofs_map_dev {
+ struct block_device *m_bdev;
+ struct dax_device *m_daxdev;
+ u64 m_dax_part_off;
+
+ erofs_off_t m_pa;
+ unsigned int m_deviceid;
+};
+
/* data.c */
extern const struct file_operations erofs_file_fops;
-struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+void erofs_unmap_metabuf(struct erofs_buf *buf);
+void erofs_put_metabuf(struct erofs_buf *buf);
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type);
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type);
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
@@ -442,8 +537,21 @@ int erofs_pcpubuf_growsize(unsigned int nrpages);
void erofs_pcpubuf_init(void);
void erofs_pcpubuf_exit(void);
+/* sysfs.c */
+int erofs_register_sysfs(struct super_block *sb);
+void erofs_unregister_sysfs(struct super_block *sb);
+int __init erofs_init_sysfs(void);
+void erofs_exit_sysfs(void);
+
/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+static inline void erofs_pagepool_add(struct page **pagepool,
+ struct page *page)
+{
+ set_page_private(page, (unsigned long)*pagepool);
+ *pagepool = page;
+}
+void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
int erofs_workgroup_put(struct erofs_workgroup *grp);
@@ -483,6 +591,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+int z_erofs_lzma_init(void);
+void z_erofs_lzma_exit(void);
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size);
+#else
+static inline int z_erofs_lzma_init(void) { return 0; }
+/*
+ * No-op stub used when CONFIG_EROFS_FS_ZIP_LZMA is disabled. It returns
+ * nothing so that its type matches the void prototype of the real
+ * implementation declared above.
+ */
+static inline void z_erofs_lzma_exit(void) {}
+static inline int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size) {
+ if (lzma) {
+ erofs_err(sb, "lzma algorithm isn't enabled");
+ return -EINVAL;
+ }
+ return 0;
+}
+#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 8629e616028c..554efa363317 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "xattr.h"
@@ -86,14 +87,14 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
return ERR_PTR(-ENOENT);
}
-static struct page *find_target_block_classic(struct inode *dir,
- struct erofs_qstr *name,
- int *_ndirents)
+static void *find_target_block_classic(struct erofs_buf *target,
+ struct inode *dir,
+ struct erofs_qstr *name,
+ int *_ndirents)
{
unsigned int startprfx, endprfx;
int head, back;
- struct address_space *const mapping = dir->i_mapping;
- struct page *candidate = ERR_PTR(-ENOENT);
+ void *candidate = ERR_PTR(-ENOENT);
startprfx = endprfx = 0;
head = 0;
@@ -101,10 +102,11 @@ static struct page *find_target_block_classic(struct inode *dir,
while (head <= back) {
const int mid = head + (back - head) / 2;
- struct page *page = read_mapping_page(mapping, mid, NULL);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_dirent *de;
- if (!IS_ERR(page)) {
- struct erofs_dirent *de = kmap_atomic(page);
+ de = erofs_bread(&buf, dir, mid, EROFS_KMAP);
+ if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff,
EROFS_BLKSIZ);
const int ndirents = nameoff / sizeof(*de);
@@ -113,13 +115,12 @@ static struct page *find_target_block_classic(struct inode *dir,
struct erofs_qstr dname;
if (!ndirents) {
- kunmap_atomic(de);
- put_page(page);
+ erofs_put_metabuf(&buf);
erofs_err(dir->i_sb,
"corrupted dir block %d @ nid %llu",
mid, EROFS_I(dir)->nid);
DBG_BUGON(1);
- page = ERR_PTR(-EFSCORRUPTED);
+ de = ERR_PTR(-EFSCORRUPTED);
goto out;
}
@@ -135,7 +136,6 @@ static struct page *find_target_block_classic(struct inode *dir,
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- kunmap_atomic(de);
if (!diff) {
*_ndirents = 0;
@@ -145,11 +145,12 @@ static struct page *find_target_block_classic(struct inode *dir,
startprfx = matched;
if (!IS_ERR(candidate))
- put_page(candidate);
- candidate = page;
+ erofs_put_metabuf(target);
+ *target = buf;
+ candidate = de;
*_ndirents = ndirents;
} else {
- put_page(page);
+ erofs_put_metabuf(&buf);
back = mid - 1;
endprfx = matched;
@@ -158,8 +159,8 @@ static struct page *find_target_block_classic(struct inode *dir,
}
out: /* free if the candidate is valid */
if (!IS_ERR(candidate))
- put_page(candidate);
- return page;
+ erofs_put_metabuf(target);
+ return de;
}
return candidate;
}
@@ -169,8 +170,7 @@ int erofs_namei(struct inode *dir,
erofs_nid_t *nid, unsigned int *d_type)
{
int ndirents;
- struct page *page;
- void *data;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
struct erofs_qstr qn;
@@ -181,26 +181,20 @@ int erofs_namei(struct inode *dir,
qn.end = name->name + name->len;
ndirents = 0;
- page = find_target_block_classic(dir, &qn, &ndirents);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ de = find_target_block_classic(&buf, dir, &qn, &ndirents);
+ if (IS_ERR(de))
+ return PTR_ERR(de);
- data = kmap_atomic(page);
/* the target page has been mapped */
if (ndirents)
- de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents);
- else
- de = (struct erofs_dirent *)data;
+ de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
if (!IS_ERR(de)) {
*nid = le64_to_cpu(de->nid);
*d_type = de->file_type;
}
-
- kunmap_atomic(data);
- put_page(page);
-
+ erofs_put_metabuf(&buf);
return PTR_ERR_OR_ZERO(de);
}
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
index 6c885575128a..a2efd833d1b6 100644
--- a/fs/erofs/pcpubuf.c
+++ b/fs/erofs/pcpubuf.c
@@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
{
static DEFINE_MUTEX(pcb_resize_mutex);
static unsigned int pcb_nrpages;
- LIST_HEAD(pagepool);
+ struct page *pagepool = NULL;
int delta, cpu, ret, i;
mutex_lock(&pcb_resize_mutex);
@@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
vunmap(old_ptr);
free_pagearray:
while (i)
- list_add(&oldpages[--i]->lru, &pagepool);
+ erofs_pagepool_add(&pagepool, oldpages[--i]);
kfree(oldpages);
if (ret)
break;
}
pcb_nrpages = nrpages;
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
out:
mutex_unlock(&pcb_resize_mutex);
return ret;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 11b88559f8bf..0c4b41130c2f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#include <linux/module.h>
#include <linux/buffer_head.h>
@@ -83,7 +84,7 @@ static void erofs_inode_init_once(void *ptr)
static struct inode *erofs_alloc_inode(struct super_block *sb)
{
struct erofs_inode *vi =
- kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL);
+ alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL);
if (!vi)
return NULL;
@@ -124,80 +125,50 @@ static bool check_layout_compatibility(struct super_block *sb,
#ifdef CONFIG_EROFS_FS_ZIP
/* read variable-sized metadata; the offset is first rounded up to a multiple of 4 bytes */
-static void *erofs_read_metadata(struct super_block *sb, struct page **pagep,
+static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp)
{
- struct page *page = *pagep;
u8 *buffer, *ptr;
int len, i, cnt;
- erofs_blk_t blk;
*offset = round_up(*offset, 4);
- blk = erofs_blknr(*offset);
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return ptr;
- if (!page || page->index != blk) {
- if (page) {
- unlock_page(page);
- put_page(page);
- }
- page = erofs_get_meta_page(sb, blk);
- if (IS_ERR(page))
- goto err_nullpage;
- }
-
- ptr = kmap(page);
len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]);
if (!len)
len = U16_MAX + 1;
buffer = kmalloc(len, GFP_KERNEL);
- if (!buffer) {
- buffer = ERR_PTR(-ENOMEM);
- goto out;
- }
+ if (!buffer)
+ return ERR_PTR(-ENOMEM);
*offset += sizeof(__le16);
*lengthp = len;
for (i = 0; i < len; i += cnt) {
cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i);
- blk = erofs_blknr(*offset);
-
- if (!page || page->index != blk) {
- if (page) {
- kunmap(page);
- unlock_page(page);
- put_page(page);
- }
- page = erofs_get_meta_page(sb, blk);
- if (IS_ERR(page)) {
- kfree(buffer);
- goto err_nullpage;
- }
- ptr = kmap(page);
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset),
+ EROFS_KMAP);
+ if (IS_ERR(ptr)) {
+ kfree(buffer);
+ return ptr;
}
memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt);
*offset += cnt;
}
-out:
- kunmap(page);
- *pagep = page;
return buffer;
-err_nullpage:
- *pagep = NULL;
- return page;
}
static int erofs_load_compr_cfgs(struct super_block *sb,
struct erofs_super_block *dsb)
{
- struct erofs_sb_info *sbi;
- struct page *page;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
unsigned int algs, alg;
erofs_off_t offset;
- int size, ret;
+ int size, ret = 0;
- sbi = EROFS_SB(sb);
sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
-
if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
erofs_err(sb, "try to load compressed fs with unsupported algorithms %x",
sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
@@ -205,39 +176,35 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
offset = EROFS_SUPER_OFFSET + sbi->sb_size;
- page = NULL;
alg = 0;
- ret = 0;
-
for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
void *data;
if (!(algs & 1))
continue;
- data = erofs_read_metadata(sb, &page, &offset, &size);
+ data = erofs_read_metadata(sb, &buf, &offset, &size);
if (IS_ERR(data)) {
ret = PTR_ERR(data);
- goto err;
+ break;
}
switch (alg) {
case Z_EROFS_COMPRESSION_LZ4:
ret = z_erofs_load_lz4_config(sb, dsb, data, size);
break;
+ case Z_EROFS_COMPRESSION_LZMA:
+ ret = z_erofs_load_lzma_config(sb, dsb, data, size);
+ break;
default:
DBG_BUGON(1);
ret = -EFAULT;
}
kfree(data);
if (ret)
- goto err;
- }
-err:
- if (page) {
- unlock_page(page);
- put_page(page);
+ break;
}
+ erofs_put_metabuf(&buf);
return ret;
}
#else
@@ -252,24 +219,81 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
#endif
+static int erofs_init_devices(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int ondisk_extradevs;
+ erofs_off_t pos;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_device_info *dif;
+ struct erofs_deviceslot *dis;
+ void *ptr;
+ int id, err = 0;
+
+ sbi->total_blocks = sbi->primarydevice_blocks;
+ if (!erofs_sb_has_device_table(sbi))
+ ondisk_extradevs = 0;
+ else
+ ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
+
+ if (ondisk_extradevs != sbi->devs->extra_devices) {
+ erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
+ ondisk_extradevs, sbi->devs->extra_devices);
+ return -EINVAL;
+ }
+ if (!ondisk_extradevs)
+ return 0;
+
+ sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
+ pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
+ down_read(&sbi->devs->rwsem);
+ idr_for_each_entry(&sbi->devs->tree, dif, id) {
+ struct block_device *bdev;
+
+ ptr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
+ EROFS_KMAP);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ break;
+ }
+ dis = ptr + erofs_blkoff(pos);
+
+ bdev = blkdev_get_by_path(dif->path,
+ FMODE_READ | FMODE_EXCL,
+ sb->s_type);
+ if (IS_ERR(bdev)) {
+ err = PTR_ERR(bdev);
+ break;
+ }
+ dif->bdev = bdev;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
+ dif->blocks = le32_to_cpu(dis->blocks);
+ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ sbi->total_blocks += dif->blocks;
+ pos += EROFS_DEVT_SLOT_SIZE;
+ }
+ up_read(&sbi->devs->rwsem);
+ erofs_put_metabuf(&buf);
+ return err;
+}
+
static int erofs_read_superblock(struct super_block *sb)
{
struct erofs_sb_info *sbi;
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_super_block *dsb;
unsigned int blkszbits;
void *data;
int ret;
- page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
+ data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
- return PTR_ERR(page);
+ return PTR_ERR(data);
}
sbi = EROFS_SB(sb);
-
- data = kmap(page);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
ret = -EINVAL;
@@ -303,7 +327,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size);
goto out;
}
- sbi->blocks = le32_to_cpu(dsb->blocks);
+ sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -330,9 +354,16 @@ static int erofs_read_superblock(struct super_block *sb)
ret = erofs_load_compr_cfgs(sb, dsb);
else
ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ if (ret < 0)
+ goto out;
+
+ /* handle multiple devices */
+ ret = erofs_init_devices(sb, dsb);
+
+ if (erofs_sb_has_ztailpacking(sbi))
+ erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
out:
- kunmap(page);
- put_page(page);
+ erofs_put_metabuf(&buf);
return ret;
}
@@ -340,15 +371,15 @@ out:
static void erofs_default_options(struct erofs_fs_context *ctx)
{
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
- ctx->max_sync_decompress_pages = 3;
- ctx->readahead_sync_decompress = false;
+ ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ ctx->opt.max_sync_decompress_pages = 3;
+ ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
- set_opt(ctx, XATTR_USER);
+ set_opt(&ctx->opt, XATTR_USER);
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- set_opt(ctx, POSIX_ACL);
+ set_opt(&ctx->opt, POSIX_ACL);
#endif
}
@@ -358,6 +389,7 @@ enum {
Opt_cache_strategy,
Opt_dax,
Opt_dax_enum,
+ Opt_device,
Opt_err
};
@@ -381,6 +413,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
erofs_param_cache_strategy),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
+ fsparam_string("device", Opt_device),
{}
};
@@ -392,12 +425,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
switch (mode) {
case EROFS_MOUNT_DAX_ALWAYS:
warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- set_opt(ctx, DAX_ALWAYS);
- clear_opt(ctx, DAX_NEVER);
+ set_opt(&ctx->opt, DAX_ALWAYS);
+ clear_opt(&ctx->opt, DAX_NEVER);
return true;
case EROFS_MOUNT_DAX_NEVER:
- set_opt(ctx, DAX_NEVER);
- clear_opt(ctx, DAX_ALWAYS);
+ set_opt(&ctx->opt, DAX_NEVER);
+ clear_opt(&ctx->opt, DAX_ALWAYS);
return true;
default:
DBG_BUGON(1);
@@ -412,9 +445,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
static int erofs_fc_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
- struct erofs_fs_context *ctx __maybe_unused = fc->fs_private;
+ struct erofs_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ struct erofs_device_info *dif;
+ int opt, ret;
opt = fs_parse(fc, erofs_fs_parameters, param, &result);
if (opt < 0)
@@ -424,9 +458,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_user_xattr:
#ifdef CONFIG_EROFS_FS_XATTR
if (result.boolean)
- set_opt(ctx, XATTR_USER);
+ set_opt(&ctx->opt, XATTR_USER);
else
- clear_opt(ctx, XATTR_USER);
+ clear_opt(&ctx->opt, XATTR_USER);
#else
errorfc(fc, "{,no}user_xattr options not supported");
#endif
@@ -434,16 +468,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_acl:
#ifdef CONFIG_EROFS_FS_POSIX_ACL
if (result.boolean)
- set_opt(ctx, POSIX_ACL);
+ set_opt(&ctx->opt, POSIX_ACL);
else
- clear_opt(ctx, POSIX_ACL);
+ clear_opt(&ctx->opt, POSIX_ACL);
#else
errorfc(fc, "{,no}acl options not supported");
#endif
break;
case Opt_cache_strategy:
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->cache_strategy = result.uint_32;
+ ctx->opt.cache_strategy = result.uint_32;
#else
errorfc(fc, "compression not supported, cache_strategy ignored");
#endif
@@ -456,6 +490,25 @@ static int erofs_fc_parse_param(struct fs_context *fc,
if (!erofs_fc_set_dax_mode(fc, result.uint_32))
return -EINVAL;
break;
+ case Opt_device:
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif)
+ return -ENOMEM;
+ dif->path = kstrdup(param->string, GFP_KERNEL);
+ if (!dif->path) {
+ kfree(dif);
+ return -ENOMEM;
+ }
+ down_write(&ctx->devs->rwsem);
+ ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
+ up_write(&ctx->devs->rwsem);
+ if (ret < 0) {
+ kfree(dif->path);
+ kfree(dif);
+ return ret;
+ }
+ ++ctx->devs->extra_devices;
+ break;
default:
return -ENOPARAM;
}
@@ -479,25 +532,29 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void erofs_managed_cache_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+/*
+ * This is only called on inode eviction. If some decompression requests are
+ * still in progress, wait here for a bit, rescheduling between attempts.
+ * Extra locking could be introduced instead, but it seems unnecessary.
+ */
+static void erofs_managed_cache_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- const unsigned int stop = length + offset;
+ const size_t stop = length + offset;
- DBG_BUGON(!PageLocked(page));
+ DBG_BUGON(!folio_test_locked(folio));
/* Check for potential overflow in debug mode */
- DBG_BUGON(stop > PAGE_SIZE || stop < length);
+ DBG_BUGON(stop > folio_size(folio) || stop < length);
- if (offset == 0 && stop == PAGE_SIZE)
- while (!erofs_managed_cache_releasepage(page, GFP_NOFS))
+ if (offset == 0 && stop == folio_size(folio))
+ while (!erofs_managed_cache_releasepage(&folio->page, GFP_NOFS))
cond_resched();
}
static const struct address_space_operations managed_cache_aops = {
.releasepage = erofs_managed_cache_releasepage,
- .invalidatepage = erofs_managed_cache_invalidatepage,
+ .invalidate_folio = erofs_managed_cache_invalidate_folio,
};
static int erofs_init_managed_cache(struct super_block *sb)
@@ -512,8 +569,7 @@ static int erofs_init_managed_cache(struct super_block *sb)
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &managed_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping,
- GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
sbi->managed_cache = inode;
return 0;
}
@@ -540,15 +596,22 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
sb->s_fs_info = sbi;
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
+ sbi->opt = ctx->opt;
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off);
+ sbi->devs = ctx->devs;
+ ctx->devs = NULL;
+
err = erofs_read_superblock(sb);
if (err)
return err;
- if (test_opt(ctx, DAX_ALWAYS) &&
- !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
- errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(ctx, DAX_ALWAYS);
+ if (test_opt(&sbi->opt, DAX_ALWAYS)) {
+ BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE);
+
+ if (!sbi->dax_dev) {
+ errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
}
sb->s_flags |= SB_RDONLY | SB_NOATIME;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -557,13 +620,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &erofs_sops;
sb->s_xattr = erofs_xattr_handlers;
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(&sbi->opt, POSIX_ACL))
sb->s_flags |= SB_POSIXACL;
else
sb->s_flags &= ~SB_POSIXACL;
- sbi->ctx = *ctx;
-
#ifdef CONFIG_EROFS_FS_ZIP
xa_init(&sbi->managed_pslots);
#endif
@@ -590,6 +651,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ err = erofs_register_sysfs(sb);
+ if (err)
+ return err;
+
erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi));
return 0;
}
@@ -607,20 +672,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
DBG_BUGON(!sb_rdonly(sb));
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(&ctx->opt, POSIX_ACL))
fc->sb_flags |= SB_POSIXACL;
else
fc->sb_flags &= ~SB_POSIXACL;
- sbi->ctx = *ctx;
+ sbi->opt = ctx->opt;
fc->sb_flags |= SB_RDONLY;
return 0;
}
+static int erofs_release_device_info(int id, void *ptr, void *data)
+{
+ struct erofs_device_info *dif = ptr;
+
+ fs_put_dax(dif->dax_dev);
+ if (dif->bdev)
+ blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+ kfree(dif->path);
+ kfree(dif);
+ return 0;
+}
+
+static void erofs_free_dev_context(struct erofs_dev_context *devs)
+{
+ if (!devs)
+ return;
+ idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+ idr_destroy(&devs->tree);
+ kfree(devs);
+}
+
static void erofs_fc_free(struct fs_context *fc)
{
- kfree(fc->fs_private);
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ erofs_free_dev_context(ctx->devs);
+ kfree(ctx);
}
static const struct fs_context_operations erofs_context_ops = {
@@ -632,15 +721,21 @@ static const struct fs_context_operations erofs_context_ops = {
static int erofs_init_fs_context(struct fs_context *fc)
{
- fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL);
- if (!fc->fs_private)
- return -ENOMEM;
+ struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- /* set default mount options */
- erofs_default_options(fc->fs_private);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+ if (!ctx->devs) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ fc->fs_private = ctx;
+ idr_init(&ctx->devs->tree);
+ init_rwsem(&ctx->devs->rwsem);
+ erofs_default_options(ctx);
fc->ops = &erofs_context_ops;
-
return 0;
}
@@ -659,6 +754,8 @@ static void erofs_kill_sb(struct super_block *sb)
sbi = EROFS_SB(sb);
if (!sbi)
return;
+
+ erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev);
kfree(sbi);
sb->s_fs_info = NULL;
@@ -671,6 +768,7 @@ static void erofs_put_super(struct super_block *sb)
DBG_BUGON(!sbi);
+ erofs_unregister_sysfs(sb);
erofs_shrinker_unregister(sb);
#ifdef CONFIG_EROFS_FS_ZIP
iput(sbi->managed_cache);
@@ -706,11 +804,19 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
+ err = z_erofs_lzma_init();
+ if (err)
+ goto lzma_err;
+
erofs_pcpubuf_init();
err = z_erofs_init_zip_subsystem();
if (err)
goto zip_err;
+ err = erofs_init_sysfs();
+ if (err)
+ goto sysfs_err;
+
err = register_filesystem(&erofs_fs_type);
if (err)
goto fs_err;
@@ -718,8 +824,12 @@ static int __init erofs_module_init(void)
return 0;
fs_err:
+ erofs_exit_sysfs();
+sysfs_err:
z_erofs_exit_zip_subsystem();
zip_err:
+ z_erofs_lzma_exit();
+lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
@@ -730,11 +840,14 @@ icache_err:
static void __exit erofs_module_exit(void)
{
unregister_filesystem(&erofs_fs_type);
- z_erofs_exit_zip_subsystem();
- erofs_exit_shrinker();
- /* Ensure all RCU free inodes are safe before cache is destroyed. */
+ /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
rcu_barrier();
+
+ erofs_exit_sysfs();
+ z_erofs_exit_zip_subsystem();
+ z_erofs_lzma_exit();
+ erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
erofs_pcpubuf_exit();
}
@@ -748,7 +861,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = sb->s_magic;
buf->f_bsize = EROFS_BLKSIZ;
- buf->f_blocks = sbi->blocks;
+ buf->f_blocks = sbi->total_blocks;
buf->f_bfree = buf->f_bavail = 0;
buf->f_files = ULLONG_MAX;
@@ -763,31 +876,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int erofs_show_options(struct seq_file *seq, struct dentry *root)
{
struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
- struct erofs_fs_context *ctx = &sbi->ctx;
+ struct erofs_mount_opts *opt = &sbi->opt;
#ifdef CONFIG_EROFS_FS_XATTR
- if (test_opt(ctx, XATTR_USER))
+ if (test_opt(opt, XATTR_USER))
seq_puts(seq, ",user_xattr");
else
seq_puts(seq, ",nouser_xattr");
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(opt, POSIX_ACL))
seq_puts(seq, ",acl");
else
seq_puts(seq, ",noacl");
#endif
#ifdef CONFIG_EROFS_FS_ZIP
- if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
+ if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
seq_puts(seq, ",cache_strategy=disabled");
- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
seq_puts(seq, ",cache_strategy=readahead");
- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
seq_puts(seq, ",cache_strategy=readaround");
#endif
- if (test_opt(ctx, DAX_ALWAYS))
+ if (test_opt(opt, DAX_ALWAYS))
seq_puts(seq, ",dax=always");
- if (test_opt(ctx, DAX_NEVER))
+ if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
new file mode 100644
index 000000000000..f3babf1e6608
--- /dev/null
+++ b/fs/erofs/sysfs.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd.
+ * https://www.oppo.com/
+ */
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+
+#include "internal.h"
+
+enum {
+ attr_feature,
+ attr_pointer_ui,
+ attr_pointer_bool,
+};
+
+enum {
+ struct_erofs_sb_info,
+ struct_erofs_mount_opts,
+};
+
+struct erofs_attr {
+ struct attribute attr;
+ short attr_id;
+ int struct_type, offset;
+};
+
+#define EROFS_ATTR(_name, _mode, _id) \
+static struct erofs_attr erofs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+}
+#define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name)
+#define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature)
+
+#define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \
+static struct erofs_attr erofs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .struct_type = struct_##_struct, \
+ .offset = offsetof(struct _struct, _name),\
+}
+
+#define EROFS_ATTR_RW(_name, _id, _struct) \
+ EROFS_ATTR_OFFSET(_name, 0644, _id, _struct)
+
+#define EROFS_RO_ATTR(_name, _id, _struct) \
+ EROFS_ATTR_OFFSET(_name, 0444, _id, _struct)
+
+#define EROFS_ATTR_RW_UI(_name, _struct) \
+ EROFS_ATTR_RW(_name, pointer_ui, _struct)
+
+#define EROFS_ATTR_RW_BOOL(_name, _struct) \
+ EROFS_ATTR_RW(_name, pointer_bool, _struct)
+
+#define ATTR_LIST(name) (&erofs_attr_##name.attr)
+
+#ifdef CONFIG_EROFS_FS_ZIP
+EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+#endif
+
+static struct attribute *erofs_attrs[] = {
+#ifdef CONFIG_EROFS_FS_ZIP
+ ATTR_LIST(sync_decompress),
+#endif
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs);
+
+/* Features this copy of erofs supports */
+EROFS_ATTR_FEATURE(zero_padding);
+EROFS_ATTR_FEATURE(compr_cfgs);
+EROFS_ATTR_FEATURE(big_pcluster);
+EROFS_ATTR_FEATURE(chunked_file);
+EROFS_ATTR_FEATURE(device_table);
+EROFS_ATTR_FEATURE(compr_head2);
+EROFS_ATTR_FEATURE(sb_chksum);
+EROFS_ATTR_FEATURE(ztailpacking);
+
+static struct attribute *erofs_feat_attrs[] = {
+ ATTR_LIST(zero_padding),
+ ATTR_LIST(compr_cfgs),
+ ATTR_LIST(big_pcluster),
+ ATTR_LIST(chunked_file),
+ ATTR_LIST(device_table),
+ ATTR_LIST(compr_head2),
+ ATTR_LIST(sb_chksum),
+ ATTR_LIST(ztailpacking),
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs_feat);
+
+static unsigned char *__struct_ptr(struct erofs_sb_info *sbi,
+ int struct_type, int offset)
+{
+ if (struct_type == struct_erofs_sb_info)
+ return (unsigned char *)sbi + offset;
+ if (struct_type == struct_erofs_mount_opts)
+ return (unsigned char *)&sbi->opt + offset;
+ return NULL;
+}
+
+static ssize_t erofs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
+ s_kobj);
+ struct erofs_attr *a = container_of(attr, struct erofs_attr, attr);
+ unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset);
+
+ switch (a->attr_id) {
+ case attr_feature:
+ return sysfs_emit(buf, "supported\n");
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr);
+ case attr_pointer_bool:
+ if (!ptr)
+ return 0;
+ return sysfs_emit(buf, "%d\n", *(bool *)ptr);
+ }
+ return 0;
+}
+
+static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
+ s_kobj);
+ struct erofs_attr *a = container_of(attr, struct erofs_attr, attr);
+ unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset);
+ unsigned long t;
+ int ret;
+
+ switch (a->attr_id) {
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t != (unsigned int)t)
+ return -ERANGE;
+#ifdef CONFIG_EROFS_FS_ZIP
+ if (!strcmp(a->attr.name, "sync_decompress") &&
+ (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF))
+ return -EINVAL;
+#endif
+ *(unsigned int *)ptr = t;
+ return len;
+ case attr_pointer_bool:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t != 0 && t != 1)
+ return -EINVAL;
+ *(bool *)ptr = !!t;
+ return len;
+ }
+ return 0;
+}
+
+static void erofs_sb_release(struct kobject *kobj)
+{
+ struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops erofs_attr_ops = {
+ .show = erofs_attr_show,
+ .store = erofs_attr_store,
+};
+
+static struct kobj_type erofs_sb_ktype = {
+ .default_groups = erofs_groups,
+ .sysfs_ops = &erofs_attr_ops,
+ .release = erofs_sb_release,
+};
+
+static struct kobj_type erofs_ktype = {
+ .sysfs_ops = &erofs_attr_ops,
+};
+
+static struct kset erofs_root = {
+ .kobj = {.ktype = &erofs_ktype},
+};
+
+static struct kobj_type erofs_feat_ktype = {
+ .default_groups = erofs_feat_groups,
+ .sysfs_ops = &erofs_attr_ops,
+};
+
+static struct kobject erofs_feat = {
+ .kset = &erofs_root,
+};
+
+int erofs_register_sysfs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ int err;
+
+ sbi->s_kobj.kset = &erofs_root;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ goto put_sb_kobj;
+ return 0;
+
+put_sb_kobj:
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ return err;
+}
+
+void erofs_unregister_sysfs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ if (sbi->s_kobj.state_in_sysfs) {
+ kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
+}
+
+int __init erofs_init_sysfs(void)
+{
+ int ret;
+
+ kobject_set_name(&erofs_root.kobj, "erofs");
+ erofs_root.kobj.parent = fs_kobj;
+ ret = kset_register(&erofs_root);
+ if (ret)
+ goto root_err;
+
+ ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
+ NULL, "features");
+ if (ret)
+ goto feat_err;
+ return ret;
+
+feat_err:
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+root_err:
+ return ret;
+}
+
+void erofs_exit_sysfs(void)
+{
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+}
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index bd86067a63f7..ec9a1d780dc1 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -6,20 +6,29 @@
#include "internal.h"
#include <linux/pagevec.h>
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
{
- struct page *page;
+ struct page *page = *pagepool;
- if (!list_empty(pool)) {
- page = lru_to_page(pool);
+ if (page) {
DBG_BUGON(page_ref_count(page) != 1);
- list_del(&page->lru);
+ *pagepool = (struct page *)page_private(page);
} else {
page = alloc_page(gfp);
}
return page;
}
+void erofs_release_pages(struct page **pagepool)
+{
+ while (*pagepool) {
+ struct page *page = *pagepool;
+
+ *pagepool = (struct page *)page_private(page);
+ put_page(page);
+ }
+}
+
#ifdef CONFIG_EROFS_FS_ZIP
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
@@ -141,7 +150,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
* however in order to avoid some race conditions, add a
* DBG_BUGON to observe this in advance.
*/
- DBG_BUGON(xa_erase(&sbi->managed_pslots, grp->index) != grp);
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
/* last refcount should be connected with its managed pslot. */
erofs_workgroup_unfreeze(grp, 0);
@@ -156,15 +165,19 @@ static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
unsigned int freed = 0;
unsigned long index;
+ xa_lock(&sbi->managed_pslots);
xa_for_each(&sbi->managed_pslots, index, grp) {
/* try to shrink each valid workgroup */
if (!erofs_try_to_release_workgroup(sbi, grp))
continue;
+ xa_unlock(&sbi->managed_pslots);
++freed;
if (!--nr_shrink)
- break;
+ return freed;
+ xa_lock(&sbi->managed_pslots);
}
+ xa_unlock(&sbi->managed_pslots);
return freed;
}
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 778f2c52295d..8106bcb5a38d 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -2,39 +2,20 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2021-2022, Alibaba Cloud
*/
#include <linux/security.h>
#include "xattr.h"
struct xattr_iter {
struct super_block *sb;
- struct page *page;
+ struct erofs_buf buf;
void *kaddr;
erofs_blk_t blkaddr;
unsigned int ofs;
};
-static inline void xattr_iter_end(struct xattr_iter *it, bool atomic)
-{
- /* the only user of kunmap() is 'init_inode_xattrs' */
- if (!atomic)
- kunmap(it->page);
- else
- kunmap_atomic(it->kaddr);
-
- unlock_page(it->page);
- put_page(it->page);
-}
-
-static inline void xattr_iter_end_final(struct xattr_iter *it)
-{
- if (!it->page)
- return;
-
- xattr_iter_end(it, true);
-}
-
static int init_inode_xattrs(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
@@ -43,7 +24,6 @@ static int init_inode_xattrs(struct inode *inode)
struct erofs_xattr_ibody_header *ih;
struct super_block *sb;
struct erofs_sb_info *sbi;
- bool atomic_map;
int ret = 0;
/* the most case is that xattrs of this inode are initialized. */
@@ -91,26 +71,23 @@ static int init_inode_xattrs(struct inode *inode)
sb = inode->i_sb;
sbi = EROFS_SB(sb);
+ it.buf = __EROFS_BUF_INITIALIZER;
it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize);
it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize);
- it.page = erofs_get_meta_page(sb, it.blkaddr);
- if (IS_ERR(it.page)) {
- ret = PTR_ERR(it.page);
+ /* read in shared xattr array (non-atomic, see kmalloc below) */
+ it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP);
+ if (IS_ERR(it.kaddr)) {
+ ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- /* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = kmap(it.page);
- atomic_map = false;
-
ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
-
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
sizeof(uint), GFP_KERNEL);
if (!vi->xattr_shared_xattrs) {
- xattr_iter_end(&it, atomic_map);
+ erofs_put_metabuf(&it.buf);
ret = -ENOMEM;
goto out_unlock;
}
@@ -122,25 +99,22 @@ static int init_inode_xattrs(struct inode *inode)
if (it.ofs >= EROFS_BLKSIZ) {
/* cannot be unaligned */
DBG_BUGON(it.ofs != EROFS_BLKSIZ);
- xattr_iter_end(&it, atomic_map);
- it.page = erofs_get_meta_page(sb, ++it.blkaddr);
- if (IS_ERR(it.page)) {
+ it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr,
+ EROFS_KMAP);
+ if (IS_ERR(it.kaddr)) {
kfree(vi->xattr_shared_xattrs);
vi->xattr_shared_xattrs = NULL;
- ret = PTR_ERR(it.page);
+ ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
-
- it.kaddr = kmap_atomic(it.page);
- atomic_map = true;
it.ofs = 0;
}
vi->xattr_shared_xattrs[i] =
le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs));
it.ofs += sizeof(__le32);
}
- xattr_iter_end(&it, atomic_map);
+ erofs_put_metabuf(&it.buf);
/* paired with smp_mb() at the beginning of the function. */
smp_mb();
@@ -172,19 +146,11 @@ static inline int xattr_iter_fixup(struct xattr_iter *it)
if (it->ofs < EROFS_BLKSIZ)
return 0;
- xattr_iter_end(it, true);
-
it->blkaddr += erofs_blknr(it->ofs);
-
- it->page = erofs_get_meta_page(it->sb, it->blkaddr);
- if (IS_ERR(it->page)) {
- int err = PTR_ERR(it->page);
-
- it->page = NULL;
- return err;
- }
-
- it->kaddr = kmap_atomic(it->page);
+ it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr,
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
it->ofs = erofs_blkoff(it->ofs);
return 0;
}
@@ -207,11 +173,10 @@ static int inline_xattr_iter_begin(struct xattr_iter *it,
it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs);
it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
- it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr);
- if (IS_ERR(it->page))
- return PTR_ERR(it->page);
-
- it->kaddr = kmap_atomic(it->page);
+ it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr,
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
return vi->xattr_isize - xattr_header_sz;
}
@@ -272,7 +237,7 @@ static int xattr_foreach(struct xattr_iter *it,
it->ofs = 0;
}
- slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
+ slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs,
entry.e_name_len - processed);
/* handle name */
@@ -307,7 +272,7 @@ static int xattr_foreach(struct xattr_iter *it,
it->ofs = 0;
}
- slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
+ slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs,
value_sz - processed);
op->value(it, processed, it->kaddr + it->ofs, slice);
it->ofs += slice;
@@ -386,8 +351,6 @@ static int inline_getxattr(struct inode *inode, struct getxattr_iter *it)
if (ret != -ENOATTR)
break;
}
- xattr_iter_end_final(&it->it);
-
return ret ? ret : it->buffer_size;
}
@@ -404,32 +367,22 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
-
- if (!i || blkaddr != it->it.blkaddr) {
- if (i)
- xattr_iter_end(&it->it, true);
-
- it->it.page = erofs_get_meta_page(sb, blkaddr);
- if (IS_ERR(it->it.page))
- return PTR_ERR(it->it.page);
-
- it->it.kaddr = kmap_atomic(it->it.page);
- it->it.blkaddr = blkaddr;
- }
+ it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(it->it.kaddr))
+ return PTR_ERR(it->it.kaddr);
+ it->it.blkaddr = blkaddr;
ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL);
if (ret != -ENOATTR)
break;
}
- if (vi->xattr_shared_count)
- xattr_iter_end_final(&it->it);
-
return ret ? ret : it->buffer_size;
}
static bool erofs_xattr_user_list(struct dentry *dentry)
{
- return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER);
+ return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
}
static bool erofs_xattr_trusted_list(struct dentry *dentry)
@@ -452,10 +405,11 @@ int erofs_getxattr(struct inode *inode, int index,
return ret;
it.index = index;
-
it.name.len = strlen(name);
if (it.name.len > EROFS_NAME_LEN)
return -ERANGE;
+
+ it.it.buf = __EROFS_BUF_INITIALIZER;
it.name.name = name;
it.buffer = buffer;
@@ -465,6 +419,7 @@ int erofs_getxattr(struct inode *inode, int index,
ret = inline_getxattr(inode, &it);
if (ret == -ENOATTR)
ret = shared_getxattr(inode, &it);
+ erofs_put_metabuf(&it.it.buf);
return ret;
}
@@ -476,7 +431,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
switch (handler->flags) {
case EROFS_XATTR_INDEX_USER:
- if (!test_opt(&sbi->ctx, XATTR_USER))
+ if (!test_opt(&sbi->opt, XATTR_USER))
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
@@ -607,7 +562,6 @@ static int inline_listxattr(struct listxattr_iter *it)
if (ret)
break;
}
- xattr_iter_end_final(&it->it);
return ret ? ret : it->buffer_ofs;
}
@@ -625,25 +579,16 @@ static int shared_listxattr(struct listxattr_iter *it)
xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
- if (!i || blkaddr != it->it.blkaddr) {
- if (i)
- xattr_iter_end(&it->it, true);
-
- it->it.page = erofs_get_meta_page(sb, blkaddr);
- if (IS_ERR(it->it.page))
- return PTR_ERR(it->it.page);
-
- it->it.kaddr = kmap_atomic(it->it.page);
- it->it.blkaddr = blkaddr;
- }
+ it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(it->it.kaddr))
+ return PTR_ERR(it->it.kaddr);
+ it->it.blkaddr = blkaddr;
ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL);
if (ret)
break;
}
- if (vi->xattr_shared_count)
- xattr_iter_end_final(&it->it);
-
return ret ? ret : it->buffer_ofs;
}
@@ -659,6 +604,7 @@ ssize_t erofs_listxattr(struct dentry *dentry,
if (ret)
return ret;
+ it.it.buf = __EROFS_BUF_INITIALIZER;
it.dentry = dentry;
it.buffer = buffer;
it.buffer_size = buffer_size;
@@ -667,9 +613,10 @@ ssize_t erofs_listxattr(struct dentry *dentry,
it.it.sb = dentry->d_sb;
ret = inline_listxattr(&it);
- if (ret < 0 && ret != -ENOATTR)
- return ret;
- return shared_listxattr(&it);
+ if (ret >= 0 || ret == -ENOATTR)
+ ret = shared_listxattr(&it);
+ erofs_put_metabuf(&it.it.buf);
+ return ret;
}
#ifdef CONFIG_EROFS_FS_POSIX_ACL
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 94090c74b3f7..332462c59f11 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -86,4 +86,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu);
#endif
#endif
-
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 11c7a1aaebad..e6dea6dfca16 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -82,12 +82,13 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
{
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
int i;
for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
- if (pcl->pclusterpages > pcs->maxpages)
+ if (pclusterpages > pcs->maxpages)
continue;
kmem_cache_free(pcs->slab, pcl);
@@ -96,16 +97,9 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
DBG_BUGON(1);
}
-/*
- * a compressed_pages[] placeholder in order to avoid
- * being filled with file pages for in-place decompression.
- */
-#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D)
-
/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
DONTALLOC, /* don't allocate any cached pages */
- DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */
/*
* try to use cached I/O if page allocation succeeds or fallback
* to in-place I/O instead to avoid any direct reclaim.
@@ -198,7 +192,10 @@ enum z_erofs_collectmode {
COLLECT_PRIMARY_FOLLOWED,
};
-struct z_erofs_collector {
+struct z_erofs_decompress_frontend {
+ struct inode *const inode;
+ struct erofs_map_blocks map;
+
struct z_erofs_pagevec_ctor vector;
struct z_erofs_pcluster *pcl, *tailpcl;
@@ -208,13 +205,6 @@ struct z_erofs_collector {
z_erofs_next_pcluster_t owned_head;
enum z_erofs_collectmode mode;
-};
-
-struct z_erofs_decompress_frontend {
- struct inode *const inode;
-
- struct z_erofs_collector clt;
- struct erofs_map_blocks map;
bool readahead;
/* used for applying cache strategy on the fly */
@@ -222,30 +212,30 @@ struct z_erofs_decompress_frontend {
erofs_off_t headoffset;
};
-#define COLLECTOR_INIT() { \
- .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = COLLECT_PRIMARY_FOLLOWED }
-
#define DECOMPRESS_FRONTEND_INIT(__i) { \
- .inode = __i, .clt = COLLECTOR_INIT(), \
- .backmost = true, }
+ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = COLLECT_PRIMARY_FOLLOWED }
static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
-static void preload_compressed_pages(struct z_erofs_collector *clt,
- struct address_space *mc,
- enum z_erofs_cache_alloctype type,
- struct list_head *pagepool)
+static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
+ enum z_erofs_cache_alloctype type,
+ struct page **pagepool)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
+ struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
+ struct z_erofs_pcluster *pcl = fe->pcl;
bool standalone = true;
+ /*
+ * optimistic allocation without direct reclaim since inplace I/O
+ * can be used if low memory otherwise.
+ */
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
struct page **pages;
pgoff_t index;
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
+ if (fe->mode < COLLECT_PRIMARY_FOLLOWED)
return;
pages = pcl->compressed_pages;
@@ -267,10 +257,6 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
/* I/O is needed, no possible to decompress directly */
standalone = false;
switch (type) {
- case DELAYEDALLOC:
- t = tagptr_init(compressed_page_t,
- PAGE_UNALLOCATED);
- break;
case TRYALLOC:
newpage = erofs_allocpage(pagepool, gfp);
if (!newpage)
@@ -287,12 +273,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
continue;
- if (page) {
+ if (page)
put_page(page);
- } else if (newpage) {
- set_page_private(newpage, 0);
- list_add(&newpage->lru, pagepool);
- }
+ else if (newpage)
+ erofs_pagepool_add(pagepool, newpage);
}
/*
@@ -300,7 +284,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
* managed cache since it can be moved to the bypass queue instead.
*/
if (standalone)
- clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}
/* called by erofs_shrinker to get rid of all compressed_pages */
@@ -311,6 +295,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
container_of(grp, struct z_erofs_pcluster, obj);
int i;
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
/*
* refcount of workgroup is now freezed as 1,
* therefore no need to worry about available decompression users.
@@ -344,6 +329,7 @@ int erofs_try_to_free_cached_page(struct page *page)
if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
unsigned int i;
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
for (i = 0; i < pcl->pclusterpages; ++i) {
if (pcl->compressed_pages[i] == page) {
WRITE_ONCE(pcl->compressed_pages[i], NULL);
@@ -360,47 +346,47 @@ int erofs_try_to_free_cached_page(struct page *page)
}
/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
-static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
struct page *page)
{
- struct z_erofs_pcluster *const pcl = clt->pcl;
+ struct z_erofs_pcluster *const pcl = fe->pcl;
- while (clt->icpage_ptr > pcl->compressed_pages)
- if (!cmpxchg(--clt->icpage_ptr, NULL, page))
+ while (fe->icpage_ptr > pcl->compressed_pages)
+ if (!cmpxchg(--fe->icpage_ptr, NULL, page))
return true;
return false;
}
/* callers must be with collection lock held */
-static int z_erofs_attach_page(struct z_erofs_collector *clt,
- struct page *page,
- enum z_erofs_page_type type)
+static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
+ struct page *page, enum z_erofs_page_type type,
+ bool pvec_safereuse)
{
int ret;
/* give priority for inplaceio */
- if (clt->mode >= COLLECT_PRIMARY &&
+ if (fe->mode >= COLLECT_PRIMARY &&
type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
- z_erofs_try_inplace_io(clt, page))
+ z_erofs_try_inplace_io(fe, page))
return 0;
- ret = z_erofs_pagevec_enqueue(&clt->vector, page, type);
- clt->cl->vcnt += (unsigned int)ret;
-
+ ret = z_erofs_pagevec_enqueue(&fe->vector, page, type,
+ pvec_safereuse);
+ fe->cl->vcnt += (unsigned int)ret;
return ret ? 0 : -EAGAIN;
}
-static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
+static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
- z_erofs_next_pcluster_t *owned_head = &clt->owned_head;
+ struct z_erofs_pcluster *pcl = f->pcl;
+ z_erofs_next_pcluster_t *owned_head = &f->owned_head;
/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
*owned_head) == Z_EROFS_PCLUSTER_NIL) {
*owned_head = &pcl->next;
/* so we can attach this pcluster to our submission chain. */
- clt->mode = COLLECT_PRIMARY_FOLLOWED;
+ f->mode = COLLECT_PRIMARY_FOLLOWED;
return;
}
@@ -411,24 +397,24 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
*owned_head) == Z_EROFS_PCLUSTER_TAIL) {
*owned_head = Z_EROFS_PCLUSTER_TAIL;
- clt->mode = COLLECT_PRIMARY_HOOKED;
- clt->tailpcl = NULL;
+ f->mode = COLLECT_PRIMARY_HOOKED;
+ f->tailpcl = NULL;
return;
}
/* type 3, it belongs to a chain, but it isn't the end of the chain */
- clt->mode = COLLECT_PRIMARY;
+ f->mode = COLLECT_PRIMARY;
}
-static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
+static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
+ struct z_erofs_pcluster *pcl = fe->pcl;
struct z_erofs_collection *cl;
unsigned int length;
/* to avoid unexpected loop formed by corrupted images */
- if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
+ if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -459,43 +445,44 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
}
mutex_lock(&cl->lock);
/* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = pcl;
- z_erofs_try_to_claim_pcluster(clt);
- clt->cl = cl;
+ z_erofs_try_to_claim_pcluster(fe);
+ fe->cl = cl;
return 0;
}
-static int z_erofs_register_collection(struct z_erofs_collector *clt,
+static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
+ bool ztailpacking = map->m_flags & EROFS_MAP_META;
struct z_erofs_pcluster *pcl;
struct z_erofs_collection *cl;
struct erofs_workgroup *grp;
int err;
+ if (!(map->m_flags & EROFS_MAP_ENCODED)) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
/* no available pcluster, let's allocate one */
- pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
+ pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
+ map->m_plen >> PAGE_SHIFT);
if (IS_ERR(pcl))
return PTR_ERR(pcl);
atomic_set(&pcl->obj.refcount, 1);
- pcl->obj.index = map->m_pa >> PAGE_SHIFT;
-
+ pcl->algorithmformat = map->m_algorithmformat;
pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
(map->m_flags & EROFS_MAP_FULL_MAPPED ?
Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
- if (map->m_flags & EROFS_MAP_ZIPPED)
- pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
- else
- pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
-
/* new pclusters should be claimed as type 1, primary and followed */
- pcl->next = clt->owned_head;
- clt->mode = COLLECT_PRIMARY_FOLLOWED;
+ pcl->next = fe->owned_head;
+ fe->mode = COLLECT_PRIMARY_FOLLOWED;
cl = z_erofs_primarycollection(pcl);
cl->pageofs = map->m_la & ~PAGE_MASK;
@@ -507,23 +494,32 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
mutex_init(&cl->lock);
DBG_BUGON(!mutex_trylock(&cl->lock));
- grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
- if (IS_ERR(grp)) {
- err = PTR_ERR(grp);
- goto err_out;
- }
+ if (ztailpacking) {
+ pcl->obj.index = 0; /* which indicates ztailpacking */
+ pcl->pageofs_in = erofs_blkoff(map->m_pa);
+ pcl->tailpacking_size = map->m_plen;
+ } else {
+ pcl->obj.index = map->m_pa >> PAGE_SHIFT;
- if (grp != &pcl->obj) {
- clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
- err = -EEXIST;
- goto err_out;
+ grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
+ if (IS_ERR(grp)) {
+ err = PTR_ERR(grp);
+ goto err_out;
+ }
+
+ if (grp != &pcl->obj) {
+ fe->pcl = container_of(grp,
+ struct z_erofs_pcluster, obj);
+ err = -EEXIST;
+ goto err_out;
+ }
}
/* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
- clt->owned_head = &pcl->next;
- clt->pcl = pcl;
- clt->cl = cl;
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = pcl;
+ fe->owned_head = &pcl->next;
+ fe->pcl = pcl;
+ fe->cl = cl;
return 0;
err_out:
@@ -532,48 +528,51 @@ err_out:
return err;
}
-static int z_erofs_collector_begin(struct z_erofs_collector *clt,
+static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
struct erofs_workgroup *grp;
int ret;
- DBG_BUGON(clt->cl);
+ DBG_BUGON(fe->cl);
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- if (!PAGE_ALIGNED(map->m_pa)) {
- DBG_BUGON(1);
- return -EINVAL;
+ if (map->m_flags & EROFS_MAP_META) {
+ if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ goto tailpacking;
}
grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
if (grp) {
- clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
} else {
- ret = z_erofs_register_collection(clt, inode, map);
-
+tailpacking:
+ ret = z_erofs_register_collection(fe, inode, map);
if (!ret)
goto out;
if (ret != -EEXIST)
return ret;
}
- ret = z_erofs_lookup_collection(clt, inode, map);
+ ret = z_erofs_lookup_collection(fe, inode, map);
if (ret) {
- erofs_workgroup_put(&clt->pcl->obj);
+ erofs_workgroup_put(&fe->pcl->obj);
return ret;
}
out:
- z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- clt->cl->pagevec, clt->cl->vcnt);
-
+ z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS,
+ fe->cl->pagevec, fe->cl->vcnt);
/* since file-backed online pages are traversed in reverse order */
- clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages;
+ fe->icpage_ptr = fe->pcl->compressed_pages +
+ z_erofs_pclusterpages(fe->pcl);
return 0;
}
@@ -607,24 +606,24 @@ static void z_erofs_collection_put(struct z_erofs_collection *cl)
erofs_workgroup_put(&pcl->obj);
}
-static bool z_erofs_collector_end(struct z_erofs_collector *clt)
+static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
- struct z_erofs_collection *cl = clt->cl;
+ struct z_erofs_collection *cl = fe->cl;
if (!cl)
return false;
- z_erofs_pagevec_ctor_exit(&clt->vector, false);
+ z_erofs_pagevec_ctor_exit(&fe->vector, false);
mutex_unlock(&cl->lock);
/*
* if all pending pages are added, don't hold its reference
* any longer if the pcluster isn't hosted by ourselves.
*/
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
+ if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
z_erofs_collection_put(cl);
- clt->cl = NULL;
+ fe->cl = NULL;
return true;
}
@@ -643,12 +642,11 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, struct list_head *pagepool)
+ struct page *page, struct page **pagepool)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
- struct z_erofs_collector *const clt = &fe->clt;
const loff_t offset = page_offset(page);
bool tight = true;
@@ -669,7 +667,7 @@ repeat:
if (offset + cur >= map->m_la &&
offset + cur < map->m_la + map->m_llen) {
/* didn't get a valid collection previously (very rare) */
- if (!clt->cl)
+ if (!fe->cl)
goto restart_now;
goto hitted;
}
@@ -677,7 +675,7 @@ repeat:
/* go ahead the next map_blocks */
erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);
- if (z_erofs_collector_end(clt))
+ if (z_erofs_collector_end(fe))
fe->backmost = false;
map->m_la = offset + cur;
@@ -690,19 +688,34 @@ restart_now:
if (!(map->m_flags & EROFS_MAP_MAPPED))
goto hitted;
- err = z_erofs_collector_begin(clt, inode, map);
+ err = z_erofs_collector_begin(fe, inode, map);
if (err)
goto err_out;
- /* preload all compressed pages (maybe downgrade role if necessary) */
- if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
- cache_strategy = TRYALLOC;
- else
- cache_strategy = DONTALLOC;
+ if (z_erofs_is_inline_pcluster(fe->pcl)) {
+ void *mp;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi),
- cache_strategy, pagepool);
+ mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
+ erofs_blknr(map->m_pa), EROFS_NO_KMAP);
+ if (IS_ERR(mp)) {
+ err = PTR_ERR(mp);
+ erofs_err(inode->i_sb,
+ "failed to get inline page, err %d", err);
+ goto err_out;
+ }
+ get_page(fe->map.buf.page);
+ WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page);
+ fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ } else {
+ /* bind cache first when cached decompression is preferred */
+ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
+ map->m_la))
+ cache_strategy = TRYALLOC;
+ else
+ cache_strategy = DONTALLOC;
+ z_erofs_bind_cache(fe, cache_strategy, pagepool);
+ }
hitted:
/*
* Ensure the current partial page belongs to this submit chain rather
@@ -710,8 +723,8 @@ hitted:
* those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or pagevec (should be processed in strict order.)
*/
- tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
- clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED &&
+ fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -726,18 +739,19 @@ hitted:
Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
if (cur)
- tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+ tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED);
retry:
- err = z_erofs_attach_page(clt, page, page_type);
+ err = z_erofs_attach_page(fe, page, page_type,
+ fe->mode >= COLLECT_PRIMARY_FOLLOWED);
/* should allocate an additional short-lived page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
alloc_page(GFP_NOFS | __GFP_NOFAIL);
set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
- err = z_erofs_attach_page(clt, newpage,
- Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+ err = z_erofs_attach_page(fe, newpage,
+ Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
if (!err)
goto retry;
}
@@ -752,7 +766,7 @@ retry:
/* bump up the number of spiltted parts of a page */
++spiltted;
/* also update nr_pages */
- clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
+ fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1);
next_part:
/* can be used for verification */
map->m_llen = offset + cur - map->m_la;
@@ -774,32 +788,19 @@ err_out:
goto out;
}
-static void z_erofs_decompressqueue_work(struct work_struct *work);
-static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
- bool sync, int bios)
+static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
+ unsigned int readahead_pages)
{
- struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
-
- /* wake up the caller thread for sync decompression */
- if (sync) {
- unsigned long flags;
+ /* auto: enable for readpage, disable for readahead */
+ if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
+ !readahead_pages)
+ return true;
- spin_lock_irqsave(&io->u.wait.lock, flags);
- if (!atomic_add_return(bios, &io->pending_bios))
- wake_up_locked(&io->u.wait);
- spin_unlock_irqrestore(&io->u.wait.lock, flags);
- return;
- }
+ if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
+ (readahead_pages <= sbi->opt.max_sync_decompress_pages))
+ return true;
- if (atomic_add_return(bios, &io->pending_bios))
- return;
- /* Use workqueue and sync decompression for atomic contexts only */
- if (in_atomic() || irqs_disabled()) {
- queue_work(z_erofs_workqueue, &io->u.work);
- sbi->ctx.readahead_sync_decompress = true;
- return;
- }
- z_erofs_decompressqueue_work(&io->u.work);
+ return false;
}
static bool z_erofs_page_is_invalidated(struct page *page)
@@ -807,38 +808,12 @@ static bool z_erofs_page_is_invalidated(struct page *page)
return !page->mapping && !z_erofs_is_shortlived_page(page);
}
-static void z_erofs_decompressqueue_endio(struct bio *bio)
-{
- tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
- struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
- blk_status_t err = bio->bi_status;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
-
- DBG_BUGON(PageUptodate(page));
- DBG_BUGON(z_erofs_page_is_invalidated(page));
-
- if (err)
- SetPageError(page);
-
- if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
- if (!err)
- SetPageUptodate(page);
- unlock_page(page);
- }
- }
- z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
- bio_put(bio);
-}
-
static int z_erofs_decompress_pcluster(struct super_block *sb,
struct z_erofs_pcluster *pcl,
- struct list_head *pagepool)
+ struct page **pagepool)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
struct z_erofs_pagevec_ctor ctor;
unsigned int i, inputsize, outputsize, llen, nr_pages;
struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
@@ -920,15 +895,20 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
overlapped = false;
compressed_pages = pcl->compressed_pages;
- for (i = 0; i < pcl->pclusterpages; ++i) {
+ for (i = 0; i < pclusterpages; ++i) {
unsigned int pagenr;
page = compressed_pages[i];
-
/* all compressed pages ought to be valid */
DBG_BUGON(!page);
- DBG_BUGON(z_erofs_page_is_invalidated(page));
+ if (z_erofs_is_inline_pcluster(pcl)) {
+ if (!PageUptodate(page))
+ err = -EIO;
+ continue;
+ }
+
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
if (!z_erofs_is_shortlived_page(page)) {
if (erofs_page_is_managed(sbi, page)) {
if (!PageUptodate(page))
@@ -973,11 +953,16 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
partial = true;
}
- inputsize = pcl->pclusterpages * PAGE_SIZE;
+ if (z_erofs_is_inline_pcluster(pcl))
+ inputsize = pcl->tailpacking_size;
+ else
+ inputsize = pclusterpages * PAGE_SIZE;
+
err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
.sb = sb,
.in = compressed_pages,
.out = pages,
+ .pageofs_in = pcl->pageofs_in,
.pageofs_out = cl->pageofs,
.inputsize = inputsize,
.outputsize = outputsize,
@@ -987,17 +972,22 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
}, pagepool);
out:
- /* must handle all compressed pages before ending pages */
- for (i = 0; i < pcl->pclusterpages; ++i) {
- page = compressed_pages[i];
-
- if (erofs_page_is_managed(sbi, page))
- continue;
+ /* must handle all compressed pages before actual file pages */
+ if (z_erofs_is_inline_pcluster(pcl)) {
+ page = compressed_pages[0];
+ WRITE_ONCE(compressed_pages[0], NULL);
+ put_page(page);
+ } else {
+ for (i = 0; i < pclusterpages; ++i) {
+ page = compressed_pages[i];
- /* recycle all individual short-lived pages */
- (void)z_erofs_put_shortlivedpage(pagepool, page);
+ if (erofs_page_is_managed(sbi, page))
+ continue;
- WRITE_ONCE(compressed_pages[i], NULL);
+ /* recycle all individual short-lived pages */
+ (void)z_erofs_put_shortlivedpage(pagepool, page);
+ WRITE_ONCE(compressed_pages[i], NULL);
+ }
}
for (i = 0; i < nr_pages; ++i) {
@@ -1036,7 +1026,7 @@ out:
}
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
- struct list_head *pagepool)
+ struct page **pagepool)
{
z_erofs_next_pcluster_t owned = io->head;
@@ -1060,22 +1050,48 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
{
struct z_erofs_decompressqueue *bgq =
container_of(work, struct z_erofs_decompressqueue, u.work);
- LIST_HEAD(pagepool);
+ struct page *pagepool = NULL;
DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
z_erofs_decompress_queue(bgq, &pagepool);
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
kvfree(bgq);
}
+static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
+ bool sync, int bios)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
+
+ /* wake up the caller thread for sync decompression */
+ if (sync) {
+ if (!atomic_add_return(bios, &io->pending_bios))
+ complete(&io->u.done);
+
+ return;
+ }
+
+ if (atomic_add_return(bios, &io->pending_bios))
+ return;
+ /* Use workqueue and sync decompression for atomic contexts only */
+ if (in_atomic() || irqs_disabled()) {
+ queue_work(z_erofs_workqueue, &io->u.work);
+ /* enable sync decompression for readahead */
+ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
+ sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
+ return;
+ }
+ z_erofs_decompressqueue_work(&io->u.work);
+}
+
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
unsigned int nr,
- struct list_head *pagepool,
- struct address_space *mc,
- gfp_t gfp)
+ struct page **pagepool,
+ struct address_space *mc)
{
const pgoff_t index = pcl->obj.index;
+ gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
struct address_space *mapping;
@@ -1091,15 +1107,6 @@ repeat:
if (!page)
goto out_allocpage;
- /*
- * the cached page has not been allocated and
- * an placeholder is out there, prepare it now.
- */
- if (page == PAGE_UNALLOCATED) {
- tocache = true;
- goto out_allocpage;
- }
-
/* process the target tagged pointer */
t = tagptr_init(compressed_page_t, page);
justfound = tagptr_unfold_tags(t);
@@ -1173,7 +1180,7 @@ repeat:
out_allocpage:
page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
- list_add(&page->lru, pagepool);
+ erofs_pagepool_add(pagepool, page);
cond_resched();
goto repeat;
}
@@ -1207,7 +1214,7 @@ jobqueue_init(struct super_block *sb,
} else {
fg_out:
q = fgq;
- init_waitqueue_head(&fgq->u.wait);
+ init_completion(&fgq->u.done);
atomic_set(&fgq->pending_bios, 0);
}
q->sb = sb;
@@ -1255,9 +1262,36 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
qtail[JQ_BYPASS] = &pcl->next;
}
+static void z_erofs_decompressqueue_endio(struct bio *bio)
+{
+ tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
+ struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
+ blk_status_t err = bio->bi_status;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+
+ DBG_BUGON(PageUptodate(page));
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
+
+ if (err)
+ SetPageError(page);
+
+ if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
+ if (!err)
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ }
+ z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
+ bio_put(bio);
+}
+
static void z_erofs_submit_queue(struct super_block *sb,
struct z_erofs_decompress_frontend *f,
- struct list_head *pagepool,
+ struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
{
@@ -1265,9 +1299,10 @@ static void z_erofs_submit_queue(struct super_block *sb,
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
- z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
- /* since bio will be NULL, no need to initialize last_index */
+ z_erofs_next_pcluster_t owned_head = f->owned_head;
+ /* bio is NULL initially, so no need to initialize last_{index,bdev} */
pgoff_t last_index;
+ struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
@@ -1279,6 +1314,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
q[JQ_SUBMIT]->head = owned_head;
do {
+ struct erofs_map_dev mdev;
struct z_erofs_pcluster *pcl;
pgoff_t cur, end;
unsigned int i = 0;
@@ -1290,37 +1326,47 @@ static void z_erofs_submit_queue(struct super_block *sb,
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
- cur = pcl->obj.index;
- end = cur + pcl->pclusterpages;
-
/* close the main owned chain at first */
owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ if (z_erofs_is_inline_pcluster(pcl)) {
+ move_to_bypass_jobqueue(pcl, qtail, owned_head);
+ continue;
+ }
+
+ /* no device id here, thus it will always succeed */
+ mdev = (struct erofs_map_dev) {
+ .m_pa = blknr_to_addr(pcl->obj.index),
+ };
+ (void)erofs_map_dev(sb, &mdev);
+
+ cur = erofs_blknr(mdev.m_pa);
+ end = cur + pcl->pclusterpages;
do {
struct page *page;
page = pickup_page_for_submission(pcl, i++, pagepool,
- MNGD_MAPPING(sbi),
- GFP_NOFS);
+ MNGD_MAPPING(sbi));
if (!page)
continue;
- if (bio && cur != last_index + 1) {
+ if (bio && (cur != last_index + 1 ||
+ last_bdev != mdev.m_bdev)) {
submit_bio_retry:
submit_bio(bio);
bio = NULL;
}
if (!bio) {
- bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
-
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
bio->bi_end_io = z_erofs_decompressqueue_endio;
- bio_set_dev(bio, sb->s_bdev);
+
+ last_bdev = mdev.m_bdev;
bio->bi_iter.bi_sector = (sector_t)cur <<
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
- bio->bi_opf = REQ_OP_READ;
if (f->readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
@@ -1355,11 +1401,11 @@ submit_bio_retry:
static void z_erofs_runqueue(struct super_block *sb,
struct z_erofs_decompress_frontend *f,
- struct list_head *pagepool, bool force_fg)
+ struct page **pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
@@ -1370,38 +1416,105 @@ static void z_erofs_runqueue(struct super_block *sb,
return;
/* wait until all bios are completed */
- io_wait_event(io[JQ_SUBMIT].u.wait,
- !atomic_read(&io[JQ_SUBMIT].pending_bios));
+ wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
+/*
+ * Since partial uptodate is still unimplemented for now, we have to use
+ * approximate readmore strategies as a start.
+ */
+static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+ struct readahead_control *rac,
+ erofs_off_t end,
+ struct page **pagepool,
+ bool backmost)
+{
+ struct inode *inode = f->inode;
+ struct erofs_map_blocks *map = &f->map;
+ erofs_off_t cur;
+ int err;
+
+ if (backmost) {
+ map->m_la = end;
+ err = z_erofs_map_blocks_iter(inode, map,
+ EROFS_GET_BLOCKS_READMORE);
+ if (err)
+ return;
+
+ /* expand ra for the trailing edge if readahead */
+ if (rac) {
+ loff_t newstart = readahead_pos(rac);
+
+ cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
+ readahead_expand(rac, newstart, cur - newstart);
+ return;
+ }
+ end = round_up(end, PAGE_SIZE);
+ } else {
+ end = round_up(map->m_la, PAGE_SIZE);
+
+ if (!map->m_llen)
+ return;
+ }
+
+ cur = map->m_la + map->m_llen - 1;
+ while (cur >= end) {
+ pgoff_t index = cur >> PAGE_SHIFT;
+ struct page *page;
+
+ page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
+ if (!page)
+ goto skip;
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto skip;
+ }
+
+ err = z_erofs_do_read_page(f, page, pagepool);
+ if (err)
+ erofs_err(inode->i_sb,
+ "readmore error at page %lu @ nid %llu",
+ index, EROFS_I(inode)->nid);
+ put_page(page);
+skip:
+ if (cur < PAGE_SIZE)
+ break;
+ cur = (index << PAGE_SHIFT) - 1;
+ }
+}
+
static int z_erofs_readpage(struct file *file, struct page *page)
{
struct inode *const inode = page->mapping->host;
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ struct page *pagepool = NULL;
int err;
- LIST_HEAD(pagepool);
trace_erofs_readpage(page, false);
-
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+ z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
+ &pagepool, true);
err = z_erofs_do_read_page(&f, page, &pagepool);
- (void)z_erofs_collector_end(&f.clt);
+ z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
+
+ (void)z_erofs_collector_end(&f);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(inode->i_sb, &f, &pagepool, true);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+ z_erofs_get_sync_decompress_policy(sbi, 0));
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
- if (f.map.mpage)
- put_page(f.map.mpage);
-
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
+ erofs_put_metabuf(&f.map.buf);
+ erofs_release_pages(&pagepool);
return err;
}
@@ -1409,29 +1522,19 @@ static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
-
- unsigned int nr_pages = readahead_count(rac);
- bool sync = (sbi->ctx.readahead_sync_decompress &&
- nr_pages <= sbi->ctx.max_sync_decompress_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *page, *head = NULL;
- LIST_HEAD(pagepool);
-
- trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ struct page *pagepool = NULL, *head = NULL, *page;
+ unsigned int nr_pages;
f.readahead = true;
f.headoffset = readahead_pos(rac);
- while ((page = readahead_page(rac))) {
- prefetchw(&page->flags);
-
- /*
- * A pure asynchronous readahead is indicated if
- * a PG_readahead marked page is hitted at first.
- * Let's also do asynchronous decompression for this case.
- */
- sync &= !(PageReadahead(page) && !head);
+ z_erofs_pcluster_readmore(&f, rac, f.headoffset +
+ readahead_length(rac) - 1, &pagepool, true);
+ nr_pages = readahead_count(rac);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ while ((page = readahead_page(rac))) {
set_page_private(page, (unsigned long)head);
head = page;
}
@@ -1450,16 +1553,13 @@ static void z_erofs_readahead(struct readahead_control *rac)
page->index, EROFS_I(inode)->nid);
put_page(page);
}
+ z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
+ (void)z_erofs_collector_end(&f);
- (void)z_erofs_collector_end(&f.clt);
-
- z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
-
- if (f.map.mpage)
- put_page(f.map.mpage);
-
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+ z_erofs_get_sync_decompress_policy(sbi, nr_pages));
+ erofs_put_metabuf(&f.map.buf);
+ erofs_release_pages(&pagepool);
}
const struct address_space_operations z_erofs_aops = {
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 3a008f1b9f78..800b11c53f57 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -62,8 +62,16 @@ struct z_erofs_pcluster {
/* A: lower limit of decompressed length and if full length or not */
unsigned int length;
- /* I: physical cluster size in pages */
- unsigned short pclusterpages;
+ /* I: page offset of inline compressed data */
+ unsigned short pageofs_in;
+
+ union {
+ /* I: physical cluster size in pages */
+ unsigned short pclusterpages;
+
+ /* I: tailpacking inline compressed size */
+ unsigned short tailpacking_size;
+ };
/* I: compression algorithm format */
unsigned char algorithmformat;
@@ -89,16 +97,21 @@ struct z_erofs_decompressqueue {
z_erofs_next_pcluster_t head;
union {
- wait_queue_head_t wait;
+ struct completion done;
struct work_struct work;
} u;
};
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
- struct page *page)
+static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
- return page->mapping == MNGD_MAPPING(sbi);
+ return !pcl->obj.index;
+}
+
+static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
+{
+ if (z_erofs_is_inline_pcluster(pcl))
+ return 1;
+ return pcl->pclusterpages;
}
#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
@@ -186,4 +199,3 @@ static inline void z_erofs_onlinepage_endio(struct page *page)
#define Z_EROFS_VMAP_GLOBAL_PAGES 2048
#endif
-
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 7a6df35fdc91..572f0b8151ba 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -7,12 +7,17 @@
#include <asm/unaligned.h>
#include <trace/events/erofs.h>
+static int z_erofs_do_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags);
+
int z_erofs_fill_inode(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
if (!erofs_sb_has_big_pcluster(sbi) &&
+ !erofs_sb_has_ztailpacking(sbi) &&
vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
vi->z_advise = 0;
vi->z_algorithmtype[0] = 0;
@@ -28,9 +33,9 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err;
+ int err, headnr;
erofs_off_t pos;
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
void *kaddr;
struct z_erofs_map_header *h;
@@ -51,26 +56,28 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_unlock;
DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+ !erofs_sb_has_ztailpacking(EROFS_SB(sb)) &&
vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, 8);
- page = erofs_get_meta_page(sb, erofs_blknr(pos));
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(kaddr)) {
+ err = PTR_ERR(kaddr);
goto out_unlock;
}
- kaddr = kmap_atomic(page);
-
h = kaddr + erofs_blkoff(pos);
vi->z_advise = le16_to_cpu(h->h_advise);
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
- vi->z_algorithmtype[0], vi->nid);
+ headnr = 0;
+ if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+ vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+ erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+ headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
err = -EOPNOTSUPP;
goto unmap_done;
}
@@ -92,13 +99,33 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
err = -EFSCORRUPTED;
goto unmap_done;
}
+unmap_done:
+ erofs_put_metabuf(&buf);
+ if (err)
+ goto out_unlock;
+
+ if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
+ struct erofs_map_blocks map = {
+ .buf = __EROFS_BUF_INITIALIZER
+ };
+
+ vi->z_idata_size = le16_to_cpu(h->h_idata_size);
+ err = z_erofs_do_map_blocks(inode, &map,
+ EROFS_GET_BLOCKS_FINDTAIL);
+ erofs_put_metabuf(&map.buf);
+
+ if (!map.m_plen ||
+ erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) {
+ erofs_err(sb, "invalid tail-packing pclustersize %llu",
+ map.m_plen);
+ err = -EFSCORRUPTED;
+ }
+ if (err < 0)
+ goto out_unlock;
+ }
/* paired with smp_mb() at the beginning of the function */
smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
-unmap_done:
- kunmap_atomic(kaddr);
- unlock_page(page);
- put_page(page);
out_unlock:
clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
return err;
@@ -111,41 +138,22 @@ struct z_erofs_maprecorder {
unsigned long lcn;
/* compression extent information gathered */
- u8 type;
+ u8 type, headtype;
u16 clusterofs;
u16 delta[2];
erofs_blk_t pblk, compressedlcs;
+ erofs_off_t nextpackoff;
};
static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
erofs_blk_t eblk)
{
struct super_block *const sb = m->inode->i_sb;
- struct erofs_map_blocks *const map = m->map;
- struct page *mpage = map->mpage;
-
- if (mpage) {
- if (mpage->index == eblk) {
- if (!m->kaddr)
- m->kaddr = kmap_atomic(mpage);
- return 0;
- }
-
- if (m->kaddr) {
- kunmap_atomic(m->kaddr);
- m->kaddr = NULL;
- }
- put_page(mpage);
- }
- mpage = erofs_get_meta_page(sb, eblk);
- if (IS_ERR(mpage)) {
- map->mpage = NULL;
- return PTR_ERR(mpage);
- }
- m->kaddr = kmap_atomic(mpage);
- unlock_page(mpage);
- map->mpage = mpage;
+ m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk,
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(m->kaddr))
+ return PTR_ERR(m->kaddr);
return 0;
}
@@ -167,6 +175,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
if (err)
return err;
+ m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index);
m->lcn = lcn;
di = m->kaddr + erofs_blkoff(pos);
@@ -178,7 +187,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
m->clusterofs = 1 << vi->z_logical_clusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
- if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -189,7 +199,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
break;
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
m->clusterofs = le16_to_cpu(di->di_clusterofs);
m->pblk = le32_to_cpu(di->di_u.blkaddr);
break;
@@ -239,12 +250,12 @@ static int get_compacted_la_distance(unsigned int lclusterbits,
static int unpack_compacted_index(struct z_erofs_maprecorder *m,
unsigned int amortizedshift,
- unsigned int eofs, bool lookahead)
+ erofs_off_t pos, bool lookahead)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
const unsigned int lomask = (1 << lclusterbits) - 1;
- unsigned int vcnt, base, lo, encodebits, nblk;
+ unsigned int vcnt, base, lo, encodebits, nblk, eofs;
int i;
u8 *in, type;
bool big_pcluster;
@@ -256,8 +267,12 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
+ /* note: this is not equivalent to round_up(..) */
+ m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
+ (vcnt << amortizedshift);
big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
+ eofs = erofs_blkoff(pos);
base = round_down(eofs, vcnt << amortizedshift);
in = m->kaddr + base;
@@ -395,8 +410,7 @@ out:
err = z_erofs_reload_indexes(m, erofs_blknr(pos));
if (err)
return err;
- return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos),
- lookahead);
+ return unpack_compacted_index(m, amortizedshift, pos, lookahead);
}
static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
@@ -417,48 +431,47 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned int lookback_distance)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
- struct erofs_map_blocks *const map = m->map;
const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned long lcn = m->lcn;
- int err;
- if (lcn < lookback_distance) {
- erofs_err(m->inode->i_sb,
- "bogus lookback distance @ nid %llu", vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
+ while (m->lcn >= lookback_distance) {
+ unsigned long lcn = m->lcn - lookback_distance;
+ int err;
- /* load extent head logical cluster if needed */
- lcn -= lookback_distance;
- err = z_erofs_load_cluster_from_disk(m, lcn, false);
- if (err)
- return err;
+ /* load extent head logical cluster if needed */
+ err = z_erofs_load_cluster_from_disk(m, lcn, false);
+ if (err)
+ return err;
- switch (m->type) {
- case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- if (!m->delta[0]) {
+ switch (m->type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ if (!m->delta[0]) {
+ erofs_err(m->inode->i_sb,
+ "invalid lookback distance 0 @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ lookback_distance = m->delta[0];
+ continue;
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
+ default:
erofs_err(m->inode->i_sb,
- "invalid lookback distance 0 @ nid %llu",
- vi->nid);
+ "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
DBG_BUGON(1);
- return -EFSCORRUPTED;
+ return -EOPNOTSUPP;
}
- return z_erofs_extent_lookback(m, m->delta[0]);
- case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- fallthrough;
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
- map->m_la = (lcn << lclusterbits) | m->clusterofs;
- break;
- default:
- erofs_err(m->inode->i_sb,
- "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
- return 0;
+
+ erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
@@ -471,13 +484,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
int err;
DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
- m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
- if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
- map->m_plen = 1 << lclusterbits;
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 &&
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type != m->headtype);
+
+ if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
+ map->m_plen = 1ULL << lclusterbits;
return 0;
}
-
lcn = m->lcn + 1;
if (m->compressedlcs)
goto out;
@@ -499,7 +517,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
switch (m->type) {
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
/*
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
* rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
@@ -520,7 +539,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return -EFSCORRUPTED;
}
out:
- map->m_plen = m->compressedlcs << lclusterbits;
+ map->m_plen = (u64)m->compressedlcs << lclusterbits;
return 0;
err_bonus_cblkcnt:
erofs_err(m->inode->i_sb,
@@ -554,7 +573,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
DBG_BUGON(!m->delta[1] &&
m->clusterofs != 1 << lclusterbits);
} else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
- m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) {
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 ||
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
/* go on until the next HEAD lcluster */
if (lcn != headlcn)
break;
@@ -572,11 +592,12 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return 0;
}
-int z_erofs_map_blocks_iter(struct inode *inode,
- struct erofs_map_blocks *map,
- int flags)
+static int z_erofs_do_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags)
{
struct erofs_inode *const vi = EROFS_I(inode);
+ bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
@@ -586,22 +607,8 @@ int z_erofs_map_blocks_iter(struct inode *inode,
unsigned long initial_lcn;
unsigned long long ofs, end;
- trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
-
- /* when trying to read beyond EOF, leave it unmapped */
- if (map->m_la >= inode->i_size) {
- map->m_llen = map->m_la + 1 - inode->i_size;
- map->m_la = inode->i_size;
- map->m_flags = 0;
- goto out;
- }
-
- err = z_erofs_fill_inode_lazy(inode);
- if (err)
- goto out;
-
lclusterbits = vi->z_logical_clusterbits;
- ofs = map->m_la;
+ ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
@@ -609,17 +616,26 @@ int z_erofs_map_blocks_iter(struct inode *inode,
if (err)
goto unmap_out;
- map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */
+ if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL))
+ vi->z_idataoff = m.nextpackoff;
+
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
switch (m.type) {
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- if (endoff >= m.clusterofs)
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- fallthrough;
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
if (endoff >= m.clusterofs) {
+ m.headtype = m.type;
map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+ /*
+ * For ztailpacking files, in order to inline data more
+ * effectively, special EOF lclusters are now supported
+ * which can have three parts at most.
+ */
+ if (ztailpacking && end > inode->i_size)
+ end = inode->i_size;
break;
}
/* m.lcn should be >= 1 if endoff < m.clusterofs */
@@ -649,27 +665,68 @@ int z_erofs_map_blocks_iter(struct inode *inode,
}
map->m_llen = end - map->m_la;
- map->m_pa = blknr_to_addr(m.pblk);
- map->m_flags |= EROFS_MAP_MAPPED;
- err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
- if (err)
- goto out;
+ if (flags & EROFS_GET_BLOCKS_FINDTAIL)
+ vi->z_tailextent_headlcn = m.lcn;
+ if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
+ map->m_flags |= EROFS_MAP_META;
+ map->m_pa = vi->z_idataoff;
+ map->m_plen = vi->z_idata_size;
+ } else {
+ map->m_pa = blknr_to_addr(m.pblk);
+ err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
+ if (err)
+ goto out;
+ }
- if (flags & EROFS_GET_BLOCKS_FIEMAP) {
+ if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
+ map->m_algorithmformat = vi->z_algorithmtype[1];
+ else
+ map->m_algorithmformat = vi->z_algorithmtype[0];
+
+ if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
+ ((flags & EROFS_GET_BLOCKS_READMORE) &&
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
+ map->m_llen >= EROFS_BLKSIZ)) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
}
unmap_out:
- if (m.kaddr)
- kunmap_atomic(m.kaddr);
+ erofs_unmap_metabuf(&m.map->buf);
out:
erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
__func__, map->m_la, map->m_pa,
map->m_llen, map->m_plen, map->m_flags);
+ return err;
+}
+
+int z_erofs_map_blocks_iter(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags)
+{
+ int err = 0;
+
+ trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
+
+ /* when trying to read beyond EOF, leave it unmapped */
+ if (map->m_la >= inode->i_size) {
+ map->m_llen = map->m_la + 1 - inode->i_size;
+ map->m_la = inode->i_size;
+ map->m_flags = 0;
+ goto out;
+ }
+
+ err = z_erofs_fill_inode_lazy(inode);
+ if (err)
+ goto out;
+
+ err = z_erofs_do_map_blocks(inode, map, flags);
+out:
trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
/* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */
@@ -685,8 +742,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
struct erofs_map_blocks map = { .m_la = offset };
ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
- if (map.mpage)
- put_page(map.mpage);
+ erofs_put_metabuf(&map.buf);
if (ret < 0)
return ret;
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
index dfd7fe0503bb..b05464f4a808 100644
--- a/fs/erofs/zpvec.h
+++ b/fs/erofs/zpvec.h
@@ -106,11 +106,18 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
struct page *page,
- enum z_erofs_page_type type)
+ enum z_erofs_page_type type,
+ bool pvec_safereuse)
{
- if (!ctor->next && type)
- if (ctor->index + 1 == ctor->nr)
+ if (!ctor->next) {
+ /* some pages cannot be reused as pvec safely without I/O */
+ if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse)
+ type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED;
+
+ if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+ ctor->index + 1 == ctor->nr)
return false;
+ }
if (ctor->index >= ctor->nr)
z_erofs_pagevec_ctor_pagedown(ctor, false);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 06f4c5ae1451..e2daa940ebce 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -307,7 +307,7 @@ static void unlist_file(struct epitems_head *head)
static long long_zero;
static long long_max = LONG_MAX;
-struct ctl_table epoll_table[] = {
+static struct ctl_table epoll_table[] = {
{
.procname = "max_user_watches",
.data = &max_user_watches,
@@ -319,6 +319,13 @@ struct ctl_table epoll_table[] = {
},
{ }
};
+
+static void __init epoll_sysctls_init(void)
+{
+ register_sysctl("fs/epoll", epoll_table);
+}
+#else
+#define epoll_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
static const struct file_operations eventpoll_fops;
@@ -2378,6 +2385,7 @@ static int __init eventpoll_init(void)
/* Allocates slab cache used to allocate "struct eppoll_entry" */
pwq_cache = kmem_cache_create("eventpoll_pwq",
sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
+ epoll_sysctls_init();
ephead_cache = kmem_cache_create("ep_head",
sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
diff --git a/fs/exec.c b/fs/exec.c
index a098c133d8d7..e3e55d5e0be1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,7 +56,6 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
-#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
@@ -65,6 +64,7 @@
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
+#include <linux/coredump.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -117,7 +117,7 @@ bool path_noexec(const struct path *path)
* Note that a shared library must be both readable and executable due to
* security reasons.
*
- * Also note that we take the address to load from from the file itself.
+ * Also note that we take the address to load from the file itself.
*/
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
@@ -494,8 +494,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* the stack. They aren't stored until much later when we can't
* signal to the parent that the child has run out of stack space.
* Instead, calculate it here so it's possible to fail gracefully.
+ *
+ * In the case of argc = 0, make sure there is space for adding an
+ * empty string (which will bump argc to 1), to ensure confused
+ * userspace programs don't start processing from argv[1], thinking
+ * argc can never be 0, to keep them from walking envp by accident.
+ * See do_execveat_common().
*/
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
if (limit <= ptr_size)
return -E2BIG;
limit -= ptr_size;
@@ -535,7 +541,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
if (!valid_arg_len(bprm, len))
goto out;
- /* We're going to work our way backwords. */
+ /* We're going to work our way backwards. */
pos = bprm->p;
str += len;
bprm->p -= len;
@@ -987,16 +993,14 @@ static int exec_mmap(struct mm_struct *mm)
if (old_mm) {
/*
- * Make sure that if there is a core dump in progress
- * for the old mm, we get out and die instead of going
- * through with the exec. We must hold mmap_lock around
- * checking core_state and changing tsk->mm.
+ * If there is a pending fatal signal, perhaps a signal
+ * whose default action is to create a coredump, get
+ * out and die instead of going through with the exec.
*/
- mmap_read_lock(old_mm);
- if (unlikely(old_mm->core_state)) {
- mmap_read_unlock(old_mm);
+ ret = mmap_read_lock_killable(old_mm);
+ if (ret) {
up_write(&tsk->signal->exec_update_lock);
- return -EINTR;
+ return ret;
}
}
@@ -1047,7 +1051,7 @@ static int de_thread(struct task_struct *tsk)
* Kill all other threads in the thread group.
*/
spin_lock_irq(lock);
- if (signal_group_exit(sig)) {
+ if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
/*
* Another group action in progress, just
* return so that the signal is processed.
@@ -1056,7 +1060,7 @@ static int de_thread(struct task_struct *tsk)
return -EAGAIN;
}
- sig->group_exit_task = tsk;
+ sig->group_exec_task = tsk;
sig->notify_count = zap_other_threads(tsk);
if (!thread_group_leader(tsk))
sig->notify_count--;
@@ -1084,7 +1088,7 @@ static int de_thread(struct task_struct *tsk)
write_lock_irq(&tasklist_lock);
/*
* Do this under tasklist_lock to ensure that
- * exit_notify() can't miss ->group_exit_task
+ * exit_notify() can't miss ->group_exec_task
*/
sig->notify_count = -1;
if (likely(leader->exit_state))
@@ -1151,7 +1155,7 @@ static int de_thread(struct task_struct *tsk)
release_task(leader);
}
- sig->group_exit_task = NULL;
+ sig->group_exec_task = NULL;
sig->notify_count = 0;
no_thread_group:
@@ -1164,7 +1168,7 @@ no_thread_group:
killed:
/* protects against exit_notify() and __exit_signal() */
read_lock(&tasklist_lock);
- sig->group_exit_task = NULL;
+ sig->group_exec_task = NULL;
sig->notify_count = 0;
read_unlock(&tasklist_lock);
return -EAGAIN;
@@ -1209,7 +1213,8 @@ static int unshare_sighand(struct task_struct *me)
char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
task_lock(tsk);
- strncpy(buf, tsk->comm, buf_size);
+ /* Always NUL terminated and zero-padded */
+ strscpy_pad(buf, tsk->comm, buf_size);
task_unlock(tsk);
return buf;
}
@@ -1224,7 +1229,7 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
task_lock(tsk);
trace_task_rename(tsk, buf);
- strlcpy(tsk->comm, buf, sizeof(tsk->comm));
+ strscpy_pad(tsk->comm, buf, sizeof(tsk->comm));
task_unlock(tsk);
perf_event_comm(tsk, exec);
}
@@ -1269,7 +1274,7 @@ int begin_new_exec(struct linux_binprm * bprm)
/*
* Must be called _before_ exec_mmap() as bprm->mm is
- * not visibile until then. This also enables the update
+ * not visible until then. This also enables the update
* to be lockless.
*/
retval = set_mm_exe_file(bprm->mm, bprm->file);
@@ -1303,12 +1308,8 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out_unlock;
- /*
- * Ensure that the uaccess routines can actually operate on userspace
- * pointers:
- */
- force_uaccess_begin();
-
+ if (me->flags & PF_KTHREAD)
+ free_kthread_struct(me);
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
@@ -1852,7 +1853,7 @@ out:
* SIGSEGV.
*/
if (bprm->point_of_no_return && !fatal_signal_pending(current))
- force_sigsegv(SIGSEGV);
+ force_fatal_sig(SIGSEGV);
out_unmark:
current->fs->in_exec = 0;
@@ -1895,6 +1896,9 @@ static int do_execveat_common(int fd, struct filename *filename,
}
retval = count(argv, MAX_ARG_STRINGS);
+ if (retval == 0)
+ pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
+ current->comm, bprm->filename);
if (retval < 0)
goto out_free;
bprm->argc = retval;
@@ -1921,6 +1925,19 @@ static int do_execveat_common(int fd, struct filename *filename,
if (retval < 0)
goto out_free;
+ /*
+ * When argv is empty, add an empty string ("") as argv[0] to
+ * ensure confused userspace programs that start processing
+ * from argv[1] won't end up walking envp. See also
+ * bprm_stack_limits().
+ */
+ if (bprm->argc == 0) {
+ retval = copy_string_kernel("", bprm);
+ if (retval < 0)
+ goto out_free;
+ bprm->argc = 1;
+ }
+
retval = bprm_execve(bprm, fd, filename, flags);
out_free:
free_bprm(bprm);
@@ -1949,6 +1966,8 @@ int kernel_execve(const char *kernel_filename,
}
retval = count_strings_kernel(argv);
+ if (WARN_ON_ONCE(retval == 0))
+ retval = -EINVAL;
if (retval < 0)
goto out_free;
bprm->argc = retval;
@@ -2098,3 +2117,37 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
argv, envp, flags);
}
#endif
+
+#ifdef CONFIG_SYSCTL
+
+static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
+static struct ctl_table fs_exec_sysctls[] = {
+ {
+ .procname = "suid_dumpable",
+ .data = &suid_dumpable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_coredump,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ { }
+};
+
+static int __init init_fs_exec_sysctls(void)
+{
+ register_sysctl_init("fs", fs_exec_sysctls);
+ return 0;
+}
+
+fs_initcall(init_fs_exec_sysctls);
+#endif /* CONFIG_SYSCTL */
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index cc5cffc4a769..03f142307174 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -105,7 +105,7 @@ int exfat_load_bitmap(struct super_block *sb)
struct exfat_dentry *ep;
struct buffer_head *bh;
- ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
if (!ep)
return -EIO;
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index cb1c0d8c1714..a27b55ec060a 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -64,7 +64,6 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
{
int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext;
unsigned int type, clu_offset, max_dentries;
- sector_t sector;
struct exfat_chain dir, clu;
struct exfat_uni_name uni_name;
struct exfat_dentry *ep;
@@ -115,7 +114,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
i = dentry & (dentries_per_clu - 1);
for ( ; i < dentries_per_clu; i++, dentry++) {
- ep = exfat_get_dentry(sb, &clu, i, &bh, &sector);
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
if (!ep)
return -EIO;
@@ -156,7 +155,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
dir_entry->namebuf.lfnbuf_len);
brelse(bh);
- ep = exfat_get_dentry(sb, &clu, i + 1, &bh, NULL);
+ ep = exfat_get_dentry(sb, &clu, i + 1, &bh);
if (!ep)
return -EIO;
dir_entry->size =
@@ -445,7 +444,6 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct timespec64 ts = current_time(inode);
- sector_t sector;
struct exfat_dentry *ep;
struct buffer_head *bh;
@@ -453,7 +451,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
* We cannot use exfat_get_dentry_set here because file ep is not
* initialized yet.
*/
- ep = exfat_get_dentry(sb, p_dir, entry, &bh, &sector);
+ ep = exfat_get_dentry(sb, p_dir, entry, &bh);
if (!ep)
return -EIO;
@@ -477,7 +475,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
- ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector);
+ ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh);
if (!ep)
return -EIO;
@@ -496,12 +494,11 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
struct super_block *sb = inode->i_sb;
int ret = 0;
int i, num_entries;
- sector_t sector;
u16 chksum;
struct exfat_dentry *ep, *fep;
struct buffer_head *fbh, *bh;
- fep = exfat_get_dentry(sb, p_dir, entry, &fbh, &sector);
+ fep = exfat_get_dentry(sb, p_dir, entry, &fbh);
if (!fep)
return -EIO;
@@ -509,7 +506,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
chksum = exfat_calc_chksum16(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
for (i = 1; i < num_entries; i++) {
- ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, NULL);
+ ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
if (!ep) {
ret = -EIO;
goto release_fbh;
@@ -531,13 +528,12 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
{
struct super_block *sb = inode->i_sb;
int i;
- sector_t sector;
unsigned short *uniname = p_uniname->name;
struct exfat_dentry *ep;
struct buffer_head *bh;
int sync = IS_DIRSYNC(inode);
- ep = exfat_get_dentry(sb, p_dir, entry, &bh, &sector);
+ ep = exfat_get_dentry(sb, p_dir, entry, &bh);
if (!ep)
return -EIO;
@@ -545,7 +541,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
exfat_update_bh(bh, sync);
brelse(bh);
- ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector);
+ ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh);
if (!ep)
return -EIO;
@@ -555,7 +551,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
brelse(bh);
for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) {
- ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, &sector);
+ ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
if (!ep)
return -EIO;
@@ -574,12 +570,11 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
{
struct super_block *sb = inode->i_sb;
int i;
- sector_t sector;
struct exfat_dentry *ep;
struct buffer_head *bh;
for (i = order; i < num_entries; i++) {
- ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, &sector);
+ ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
if (!ep)
return -EIO;
@@ -656,8 +651,8 @@ static int exfat_walk_fat_chain(struct super_block *sb,
return 0;
}
-int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir,
- int entry, sector_t *sector, int *offset)
+static int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir,
+ int entry, sector_t *sector, int *offset)
{
int ret;
unsigned int off, clu = 0;
@@ -717,8 +712,7 @@ static int exfat_dir_readahead(struct super_block *sb, sector_t sec)
}
struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, struct buffer_head **bh,
- sector_t *sector)
+ struct exfat_chain *p_dir, int entry, struct buffer_head **bh)
{
unsigned int dentries_per_page = EXFAT_B_TO_DEN(PAGE_SIZE);
int off;
@@ -740,8 +734,6 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
if (!*bh)
return NULL;
- if (sector)
- *sector = sec;
return (struct exfat_dentry *)((*bh)->b_data + off);
}
@@ -892,7 +884,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
es->bh[es->num_bh++] = bh;
}
- /* validiate cached dentries */
+ /* validate cached dentries */
for (i = 1; i < num_entries; i++) {
ep = exfat_get_dentry_cached(es, i);
if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
@@ -960,7 +952,7 @@ rewind:
if (rewind && dentry == end_eidx)
goto not_found;
- ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
if (!ep)
return -EIO;
@@ -1145,7 +1137,7 @@ int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
struct buffer_head *bh;
for (i = 0, entry++; i < ep->dentry.file.num_ext; i++, entry++) {
- ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh, NULL);
+ ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh);
if (!ext_ep)
return -EIO;
@@ -1175,7 +1167,7 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
while (clu.dir != EXFAT_EOF_CLUSTER) {
for (i = 0; i < dentries_per_clu; i++) {
- ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
if (!ep)
return -EIO;
entry_type = exfat_get_entry_type(ep);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 1d6da61157c9..c6800b880920 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -10,7 +10,6 @@
#include <linux/ratelimit.h>
#include <linux/nls.h>
-#define EXFAT_SUPER_MAGIC 0x2011BAB0UL
#define EXFAT_ROOT_INO 1
#define EXFAT_CLUSTERS_UNTRACKED (~0u)
@@ -204,7 +203,8 @@ struct exfat_mount_options {
/* on error: continue, panic, remount-ro */
enum exfat_error_mode errors;
unsigned utf8:1, /* Use of UTF-8 character set */
- discard:1; /* Issue discard requests on deletions */
+ discard:1, /* Issue discard requests on deletions */
+ keep_last_dots:1; /* Keep trailing periods in paths */
int time_offset; /* Offset of timestamps from UTC (in minutes) */
};
@@ -459,11 +459,8 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
int num_entries, unsigned int type, struct exfat_hint *hint_opt);
int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu);
-int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir,
- int entry, sector_t *sector, int *offset);
struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, struct buffer_head **bh,
- sector_t *sector);
+ struct exfat_chain *p_dir, int entry, struct buffer_head **bh);
struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
int num);
struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index e949e563443c..a3464e56a7e1 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -84,9 +84,7 @@ int exfat_ent_set(struct super_block *sb, unsigned int loc,
static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
unsigned int clus)
{
- if (clus < EXFAT_FIRST_CLUSTER || sbi->num_clusters <= clus)
- return false;
- return true;
+ return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters;
}
int exfat_ent_get(struct super_block *sb, unsigned int loc,
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 6af0191b648f..2f5130059236 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -110,8 +110,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
exfat_set_volume_dirty(sb);
num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi);
- num_clusters_phys =
- EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, sbi);
+ num_clusters_phys = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi);
exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags);
@@ -219,8 +218,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
if (exfat_free_cluster(inode, &clu))
return -EIO;
- exfat_clear_volume_dirty(sb);
-
return 0;
}
@@ -228,12 +225,13 @@ void exfat_truncate(struct inode *inode, loff_t size)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
unsigned int blocksize = i_blocksize(inode);
loff_t aligned_size;
int err;
mutex_lock(&sbi->s_lock);
- if (EXFAT_I(inode)->start_clu == 0) {
+ if (ei->start_clu == 0) {
/*
* Empty start_clu != ~0 (not allocated)
*/
@@ -251,8 +249,8 @@ void exfat_truncate(struct inode *inode, loff_t size)
else
mark_inode_dirty(inode);
- inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
- ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+ inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
+ inode->i_blkbits;
write_size:
aligned_size = i_size_read(inode);
if (aligned_size & (blocksize - 1)) {
@@ -260,11 +258,11 @@ write_size:
aligned_size++;
}
- if (EXFAT_I(inode)->i_size_ondisk > i_size_read(inode))
- EXFAT_I(inode)->i_size_ondisk = aligned_size;
+ if (ei->i_size_ondisk > i_size_read(inode))
+ ei->i_size_ondisk = aligned_size;
- if (EXFAT_I(inode)->i_size_aligned > i_size_read(inode))
- EXFAT_I(inode)->i_size_aligned = aligned_size;
+ if (ei->i_size_aligned > i_size_read(inode))
+ ei->i_size_aligned = aligned_size;
mutex_unlock(&sbi->s_lock);
}
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index ca37d4344361..fc0ea1684880 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -31,7 +31,7 @@ static int __exfat_write_inode(struct inode *inode, int sync)
return 0;
/*
- * If the indode is already unlinked, there is no need for updating it.
+ * If the inode is already unlinked, there is no need for updating it.
*/
if (ei->dir.dir == DIR_DELETED)
return 0;
@@ -114,10 +114,9 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
unsigned int local_clu_offset = clu_offset;
unsigned int num_to_be_allocated = 0, num_clusters = 0;
- if (EXFAT_I(inode)->i_size_ondisk > 0)
+ if (ei->i_size_ondisk > 0)
num_clusters =
- EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk,
- sbi);
+ EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi);
if (clu_offset >= num_clusters)
num_to_be_allocated = clu_offset - num_clusters + 1;
@@ -416,10 +415,10 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
- if (EXFAT_I(inode)->i_size_aligned < i_size_read(inode)) {
+ if (ei->i_size_aligned < i_size_read(inode)) {
exfat_fs_error(inode->i_sb,
"invalid size(size(%llu) > aligned(%llu)\n",
- i_size_read(inode), EXFAT_I(inode)->i_size_aligned);
+ i_size_read(inode), ei->i_size_aligned);
return -EIO;
}
@@ -491,7 +490,8 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations exfat_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = exfat_readpage,
.readahead = exfat_readahead,
.writepage = exfat_writepage,
@@ -603,8 +603,8 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
exfat_save_attr(inode, info->attr);
- inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
- ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+ inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
+ inode->i_blkbits;
inode->i_mtime = info->mtime;
inode->i_ctime = info->mtime;
ei->i_crtime = info->crtime;
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index d34e6193258d..d5bd8e6d9741 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
+#include <linux/blk_types.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -180,7 +181,7 @@ int exfat_update_bhs(struct buffer_head **bhs, int nr_bhs, int sync)
set_buffer_uptodate(bhs[i]);
mark_buffer_dirty(bhs[i]);
if (sync)
- write_dirty_buffer(bhs[i], 0);
+ write_dirty_buffer(bhs[i], REQ_SYNC);
}
for (i = 0; i < nr_bhs && sync; i++) {
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 24b41103d1cc..a02a04a993bf 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -65,11 +65,14 @@ static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags)
return ret;
}
-/* returns the length of a struct qstr, ignoring trailing dots */
-static unsigned int exfat_striptail_len(unsigned int len, const char *name)
+/* returns the length of a struct qstr, ignoring trailing dots if necessary */
+static unsigned int exfat_striptail_len(unsigned int len, const char *name,
+ bool keep_last_dots)
{
- while (len && name[len - 1] == '.')
- len--;
+ if (!keep_last_dots) {
+ while (len && name[len - 1] == '.')
+ len--;
+ }
return len;
}
@@ -83,7 +86,8 @@ static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr)
struct super_block *sb = dentry->d_sb;
struct nls_table *t = EXFAT_SB(sb)->nls_io;
const unsigned char *name = qstr->name;
- unsigned int len = exfat_striptail_len(qstr->len, qstr->name);
+ unsigned int len = exfat_striptail_len(qstr->len, qstr->name,
+ EXFAT_SB(sb)->options.keep_last_dots);
unsigned long hash = init_name_hash(dentry);
int i, charlen;
wchar_t c;
@@ -104,8 +108,10 @@ static int exfat_d_cmp(const struct dentry *dentry, unsigned int len,
{
struct super_block *sb = dentry->d_sb;
struct nls_table *t = EXFAT_SB(sb)->nls_io;
- unsigned int alen = exfat_striptail_len(name->len, name->name);
- unsigned int blen = exfat_striptail_len(len, str);
+ unsigned int alen = exfat_striptail_len(name->len, name->name,
+ EXFAT_SB(sb)->options.keep_last_dots);
+ unsigned int blen = exfat_striptail_len(len, str,
+ EXFAT_SB(sb)->options.keep_last_dots);
wchar_t c1, c2;
int charlen, i;
@@ -136,7 +142,8 @@ static int exfat_utf8_d_hash(const struct dentry *dentry, struct qstr *qstr)
{
struct super_block *sb = dentry->d_sb;
const unsigned char *name = qstr->name;
- unsigned int len = exfat_striptail_len(qstr->len, qstr->name);
+ unsigned int len = exfat_striptail_len(qstr->len, qstr->name,
+ EXFAT_SB(sb)->options.keep_last_dots);
unsigned long hash = init_name_hash(dentry);
int i, charlen;
unicode_t u;
@@ -161,8 +168,11 @@ static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
struct super_block *sb = dentry->d_sb;
- unsigned int alen = exfat_striptail_len(name->len, name->name);
- unsigned int blen = exfat_striptail_len(len, str);
+ unsigned int alen = exfat_striptail_len(name->len, name->name,
+ EXFAT_SB(sb)->options.keep_last_dots);
+ unsigned int blen = exfat_striptail_len(len, str,
+ EXFAT_SB(sb)->options.keep_last_dots);
+
unicode_t u_a, u_b;
int charlen, i;
@@ -229,7 +239,7 @@ static int exfat_search_empty_slot(struct super_block *sb,
i = dentry & (dentries_per_clu - 1);
for (; i < dentries_per_clu; i++, dentry++) {
- ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
if (!ep)
return -EIO;
type = exfat_get_entry_type(ep);
@@ -306,7 +316,6 @@ static int exfat_find_empty_entry(struct inode *inode,
{
int dentry;
unsigned int ret, last_clu;
- sector_t sector;
loff_t size = 0;
struct exfat_chain clu;
struct exfat_dentry *ep = NULL;
@@ -379,7 +388,7 @@ static int exfat_find_empty_entry(struct inode *inode,
struct buffer_head *bh;
ep = exfat_get_dentry(sb,
- &(ei->dir), ei->entry + 1, &bh, &sector);
+ &(ei->dir), ei->entry + 1, &bh);
if (!ep)
return -EIO;
@@ -395,9 +404,9 @@ static int exfat_find_empty_entry(struct inode *inode,
/* directory inode should be updated in here */
i_size_write(inode, size);
- EXFAT_I(inode)->i_size_ondisk += sbi->cluster_size;
- EXFAT_I(inode)->i_size_aligned += sbi->cluster_size;
- EXFAT_I(inode)->flags = p_dir->flags;
+ ei->i_size_ondisk += sbi->cluster_size;
+ ei->i_size_aligned += sbi->cluster_size;
+ ei->flags = p_dir->flags;
inode->i_blocks += 1 << sbi->sect_per_clus_bits;
}
@@ -417,13 +426,25 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path,
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
+ int pathlen = strlen(path);
- /* strip all trailing periods */
- namelen = exfat_striptail_len(strlen(path), path);
+ /*
+ * get the length of the pathname excluding
+ * trailing periods, if any.
+ */
+ namelen = exfat_striptail_len(pathlen, path, false);
+ if (EXFAT_SB(sb)->options.keep_last_dots) {
+ /*
+ * Do not allow the creation of files with names
+ * ending with period(s).
+ */
+ if (!lookup && (namelen < pathlen))
+ return -EINVAL;
+ namelen = pathlen;
+ }
if (!namelen)
return -ENOENT;
-
- if (strlen(path) > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE))
+ if (pathlen > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE))
return -ENAMETOOLONG;
/*
@@ -555,7 +576,6 @@ static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir,
exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE,
&info);
- exfat_clear_volume_dirty(sb);
if (err)
goto unlock;
@@ -779,7 +799,6 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct exfat_inode_info *ei = EXFAT_I(inode);
struct buffer_head *bh;
- sector_t sector;
int num_entries, entry, err = 0;
mutex_lock(&EXFAT_SB(sb)->s_lock);
@@ -791,7 +810,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
goto unlock;
}
- ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector);
+ ep = exfat_get_dentry(sb, &cdir, entry, &bh);
if (!ep) {
err = -EIO;
goto unlock;
@@ -814,7 +833,6 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
/* This doesn't modify ei */
ei->dir.dir = DIR_DELETED;
- exfat_clear_volume_dirty(sb);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
@@ -848,7 +866,6 @@ static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR,
&info);
- exfat_clear_volume_dirty(sb);
if (err)
goto unlock;
@@ -895,7 +912,7 @@ static int exfat_check_dir_empty(struct super_block *sb,
while (clu.dir != EXFAT_EOF_CLUSTER) {
for (i = 0; i < dentries_per_clu; i++) {
- ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
if (!ep)
return -EIO;
type = exfat_get_entry_type(ep);
@@ -932,7 +949,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
struct buffer_head *bh;
- sector_t sector;
int num_entries, entry, err;
mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
@@ -957,7 +973,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
goto unlock;
}
- ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector);
+ ep = exfat_get_dentry(sb, &cdir, entry, &bh);
if (!ep) {
err = -EIO;
goto unlock;
@@ -979,7 +995,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
goto unlock;
}
ei->dir.dir = DIR_DELETED;
- exfat_clear_volume_dirty(sb);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
@@ -1005,13 +1020,12 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
struct exfat_inode_info *ei)
{
int ret, num_old_entries, num_new_entries;
- sector_t sector_old, sector_new;
struct exfat_dentry *epold, *epnew;
struct super_block *sb = inode->i_sb;
struct buffer_head *new_bh, *old_bh;
int sync = IS_DIRSYNC(inode);
- epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh, &sector_old);
+ epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh);
if (!epold)
return -EIO;
@@ -1032,8 +1046,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
if (newentry < 0)
return newentry; /* -EIO or -ENOSPC */
- epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh,
- &sector_new);
+ epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh);
if (!epnew)
return -EIO;
@@ -1046,12 +1059,10 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
brelse(old_bh);
brelse(new_bh);
- epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh,
- &sector_old);
+ epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh);
if (!epold)
return -EIO;
- epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh,
- &sector_new);
+ epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh);
if (!epnew) {
brelse(old_bh);
return -EIO;
@@ -1093,12 +1104,11 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei)
{
int ret, newentry, num_new_entries, num_old_entries;
- sector_t sector_mov, sector_new;
struct exfat_dentry *epmov, *epnew;
struct super_block *sb = inode->i_sb;
struct buffer_head *mov_bh, *new_bh;
- epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh, &sector_mov);
+ epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh);
if (!epmov)
return -EIO;
@@ -1116,7 +1126,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
if (newentry < 0)
return newentry; /* -EIO or -ENOSPC */
- epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh, &sector_new);
+ epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh);
if (!epnew)
return -EIO;
@@ -1129,12 +1139,10 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
brelse(mov_bh);
brelse(new_bh);
- epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh,
- &sector_mov);
+ epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh);
if (!epmov)
return -EIO;
- epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh,
- &sector_new);
+ epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh);
if (!epnew) {
brelse(mov_bh);
return -EIO;
@@ -1216,7 +1224,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
exfat_chain_dup(&olddir, &ei->dir);
dentry = ei->entry;
- ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh, NULL);
+ ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh);
if (!ep) {
ret = -EIO;
goto out;
@@ -1237,7 +1245,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
p_dir = &(new_ei->dir);
new_entry = new_ei->entry;
- ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL);
+ ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh);
if (!ep)
goto out;
@@ -1277,7 +1285,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
if (!ret && new_inode) {
/* delete entries of new_dir */
- ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL);
+ ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh);
if (!ep) {
ret = -EIO;
goto del_out;
@@ -1321,7 +1329,6 @@ del_out:
*/
new_ei->dir.dir = DIR_DELETED;
}
- exfat_clear_volume_dirty(sb);
out:
return ret;
}
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 314d5407a1be..ef115e673406 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -761,7 +761,7 @@ int exfat_create_upcase_table(struct super_block *sb)
while (clu.dir != EXFAT_EOF_CLUSTER) {
for (i = 0; i < sbi->dentries_per_clu; i++) {
- ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
if (!ep)
return -EIO;
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 5539ffc20d16..8ca21e7917d1 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -17,6 +17,7 @@
#include <linux/iversion.h>
#include <linux/nls.h>
#include <linux/buffer_head.h>
+#include <linux/magic.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -99,7 +100,6 @@ static int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flags)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data;
- bool sync;
/* retain persistent-flags */
new_flags |= sbi->vol_flags_persistent;
@@ -118,16 +118,11 @@ static int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flags)
p_boot->vol_flags = cpu_to_le16(new_flags);
- if ((new_flags & VOLUME_DIRTY) && !buffer_dirty(sbi->boot_bh))
- sync = true;
- else
- sync = false;
-
set_buffer_uptodate(sbi->boot_bh);
mark_buffer_dirty(sbi->boot_bh);
- if (sync)
- sync_dirty_buffer(sbi->boot_bh);
+ __sync_dirty_buffer(sbi->boot_bh, REQ_SYNC | REQ_FUA | REQ_PREFLUSH);
+
return 0;
}
@@ -173,6 +168,8 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",errors=remount-ro");
if (opts->discard)
seq_puts(m, ",discard");
+ if (opts->keep_last_dots)
+ seq_puts(m, ",keep_last_dots");
if (opts->time_offset)
seq_printf(m, ",time_offset=%d", opts->time_offset);
return 0;
@@ -182,7 +179,7 @@ static struct inode *exfat_alloc_inode(struct super_block *sb)
{
struct exfat_inode_info *ei;
- ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, exfat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
@@ -216,6 +213,7 @@ enum {
Opt_charset,
Opt_errors,
Opt_discard,
+ Opt_keep_last_dots,
Opt_time_offset,
/* Deprecated options */
@@ -242,6 +240,7 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_string("iocharset", Opt_charset),
fsparam_enum("errors", Opt_errors, exfat_param_enums),
fsparam_flag("discard", Opt_discard),
+ fsparam_flag("keep_last_dots", Opt_keep_last_dots),
fsparam_s32("time_offset", Opt_time_offset),
__fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
NULL),
@@ -296,6 +295,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_discard:
opts->discard = 1;
break;
+ case Opt_keep_last_dots:
+ opts->keep_last_dots = 1;
+ break;
case Opt_time_offset:
/*
* Make the limit 24 just in case someone invents something
@@ -364,11 +366,11 @@ static int exfat_read_root(struct inode *inode)
inode->i_op = &exfat_dir_inode_operations;
inode->i_fop = &exfat_dir_operations;
- inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1))
- & ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
- EXFAT_I(inode)->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff;
- EXFAT_I(inode)->i_size_aligned = i_size_read(inode);
- EXFAT_I(inode)->i_size_ondisk = i_size_read(inode);
+ inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
+ inode->i_blkbits;
+ ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff;
+ ei->i_size_aligned = i_size_read(inode);
+ ei->i_size_ondisk = i_size_read(inode);
exfat_save_attr(inode, ATTR_SUBDIR);
inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3be9dd6412b7..d4f306aa5ace 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,6 +118,7 @@ struct ext2_sb_info {
spinlock_t s_lock;
struct mb_cache *s_ea_block_cache;
struct dax_device *s_daxdev;
+ u64 s_dax_part_off;
};
static inline spinlock_t *
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index df14e750e9fe..998dd2ac8008 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,11 +170,6 @@ static void ext2_preread_inode(struct inode *inode)
unsigned long offset;
unsigned long block;
struct ext2_group_desc * gdp;
- struct backing_dev_info *bdi;
-
- bdi = inode_to_bdi(inode);
- if (bdi_rw_congested(bdi))
- return;
block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 333fa62661d5..52377a0ee735 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -36,6 +36,7 @@
#include <linux/iomap.h>
#include <linux/namei.h>
#include <linux/uio.h>
+#include <linux/dax.h>
#include "ext2.h"
#include "acl.h"
#include "xattr.h"
@@ -816,9 +817,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return ret;
iomap->flags = 0;
- iomap->bdev = inode->i_sb->s_bdev;
iomap->offset = (u64)first_block << blkbits;
- iomap->dax_dev = sbi->s_daxdev;
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = sbi->s_daxdev;
+ else
+ iomap->bdev = inode->i_sb->s_bdev;
if (ret == 0) {
iomap->type = IOMAP_HOLE;
@@ -827,6 +830,8 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
} else {
iomap->type = IOMAP_MAPPED;
iomap->addr = (u64)bno << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += sbi->s_dax_part_off;
iomap->length = (u64)ret << blkbits;
iomap->flags |= IOMAP_F_MERGED;
}
@@ -962,7 +967,8 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
}
const struct address_space_operations ext2_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_writepage,
@@ -977,7 +983,8 @@ const struct address_space_operations ext2_aops = {
};
const struct address_space_operations ext2_nobh_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_nobh_writepage,
@@ -993,8 +1000,7 @@ const struct address_space_operations ext2_nobh_aops = {
static const struct address_space_operations ext2_dax_aops = {
.writepages = ext2_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
};
/*
@@ -1297,9 +1303,9 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
inode_dio_wait(inode);
if (IS_DAX(inode)) {
- error = iomap_zero_range(inode, newsize,
- PAGE_ALIGN(newsize) - newsize, NULL,
- &ext2_iomap_ops);
+ error = dax_zero_range(inode, newsize,
+ PAGE_ALIGN(newsize) - newsize, NULL,
+ &ext2_iomap_ops);
} else if (test_opt(inode->i_sb, NOBH))
error = nobh_truncate_page(inode->i_mapping,
newsize, ext2_get_block);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d8d580b609ba..f6a19f6d9f6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -180,7 +180,7 @@ static struct kmem_cache * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, ext2_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->i_block_alloc_info = NULL;
@@ -753,8 +753,12 @@ static loff_t ext2_max_size(int bits)
res += 1LL << (bits-2);
res += 1LL << (2*(bits-2));
res += 1LL << (3*(bits-2));
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
/* Does block tree limit file size? */
- if (res < upper_limit)
+ if (res + meta_blocks <= upper_limit)
goto check_lfs;
res = upper_limit;
@@ -802,7 +806,6 @@ static unsigned long descriptor_loc(struct super_block *sb,
static int ext2_fill_super(struct super_block *sb, void *data, int silent)
{
- struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
struct buffer_head * bh;
struct ext2_sb_info * sbi;
struct ext2_super_block * es;
@@ -822,17 +825,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
- goto failed;
+ return -ENOMEM;
sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
if (!sbi->s_blockgroup_lock) {
kfree(sbi);
- goto failed;
+ return -ENOMEM;
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
- sbi->s_daxdev = dax_dev;
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
@@ -946,11 +949,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (test_opt(sb, DAX)) {
- if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0,
- bdev_nr_sectors(sb->s_bdev))) {
+ if (!sbi->s_daxdev) {
ext2_msg(sb, KERN_ERR,
"DAX unsupported by block device. Turning off DAX.");
clear_opt(sbi->s_mount_opt, DAX);
+ } else if (blocksize != PAGE_SIZE) {
+ ext2_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
+ clear_opt(sbi->s_mount_opt, DAX);
}
}
@@ -1199,11 +1204,10 @@ failed_mount_group_desc:
failed_mount:
brelse(bh);
failed_sbi:
+ fs_put_dax(sbi->s_daxdev);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
-failed:
- fs_put_dax(dax_dev);
return ret;
}
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 0613dfcbfd4a..57e82e25f8e2 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -139,7 +139,7 @@ fail:
/*
* Inode operation get_posix_acl().
*
- * inode->i_mutex: don't care
+ * inode->i_rwsem: don't care
*/
struct posix_acl *
ext4_get_acl(struct inode *inode, int type, bool rcu)
@@ -183,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type, bool rcu)
/*
* Set the access or default ACL of an inode.
*
- * inode->i_mutex: down unless called from ext4_new_inode
+ * inode->i_rwsem: down unless called from ext4_new_inode
*/
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
@@ -246,7 +246,6 @@ retry:
handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
- ext4_fc_start_update(inode);
if ((type == ACL_TYPE_ACCESS) && acl) {
error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl);
@@ -264,7 +263,6 @@ retry:
}
out_stop:
ext4_journal_stop(handle);
- ext4_fc_stop_update(inode);
if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
return error;
@@ -273,8 +271,8 @@ out_stop:
/*
* Initialize the ACLs of a new inode. Called from ext4_new_inode.
*
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
+ * dir->i_rwsem: down
+ * inode->i_rwsem: up (access to inode is still exclusive)
*/
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a0fb0c4bdc7c..78ee3ef795ae 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -411,6 +411,7 @@ verified:
* ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
+ * @ignore_locked: ignore locked buffers
*
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 4666b55b736e..5504f72bbbbe 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -292,15 +292,10 @@ void ext4_release_system_zone(struct super_block *sb)
call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with some other filesystem metadata blocks.
- */
-int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
- unsigned int count)
+int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_system_blocks *system_blks;
struct ext4_system_zone *entry;
struct rb_node *n;
@@ -329,7 +324,9 @@ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
else if (start_blk >= (entry->start_blk + entry->count))
n = n->rb_right;
else {
- ret = (entry->ino == inode->i_ino);
+ ret = 0;
+ if (inode)
+ ret = (entry->ino == inode->i_ino);
break;
}
}
@@ -338,6 +335,17 @@ out_rcu:
return ret;
}
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with some other filesystem metadata blocks.
+ */
+int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count);
+}
+
int ext4_check_blockref(const char *function, unsigned int line,
struct inode *inode, __le32 *p, unsigned int max)
{
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 74b172a4adda..a6bb86f52b9a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -303,7 +303,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
goto done;
brelse(bh);
bh = NULL;
- offset = 0;
}
done:
err = 0;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3825195539d7..a743b1e3b89e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -17,6 +17,7 @@
#ifndef _EXT4_H
#define _EXT4_H
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
@@ -241,7 +242,7 @@ typedef struct ext4_io_end {
struct bio *bio; /* Linked list of completed
* bios covering the extent */
unsigned int flag; /* unwritten or not */
- atomic_t count; /* reference counter */
+ refcount_t count; /* reference counter */
struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
@@ -1027,7 +1028,7 @@ struct ext4_inode_info {
/*
* Extended attributes can be read independently of the main file
- * data. Taking i_mutex even when reading would cause contention
+ * data. Taking i_rwsem even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
@@ -1045,6 +1046,8 @@ struct ext4_inode_info {
/* Fast commit related info */
+ /* For tracking dentry create updates */
+ struct list_head i_fc_dilist;
struct list_head i_fc_list; /*
* inodes that need fast commit
* protected by sbi->s_fc_lock.
@@ -1278,7 +1281,7 @@ struct ext4_inode_info {
#define ext4_find_next_zero_bit find_next_zero_bit_le
#define ext4_find_next_bit find_next_bit_le
-extern void ext4_set_bits(void *bm, int cur, int len);
+extern void mb_set_bits(void *bm, int cur, int len);
/*
* Maximal mount counts between two filesystem checks
@@ -1297,6 +1300,8 @@ extern void ext4_set_bits(void *bm, int cur, int len);
/* Metadata checksum algorithm codes */
#define EXT4_CRC32C_CHKSUM 1
+#define EXT4_LABEL_MAX 16
+
/*
* Structure of the super block
*/
@@ -1346,7 +1351,7 @@ struct ext4_super_block {
/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
-/*78*/ char s_volume_name[16]; /* volume name */
+/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */
/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
/*
@@ -1660,7 +1665,7 @@ struct ext4_sb_info {
struct task_struct *s_mmp_tsk;
/* record the last minlen when FITRIM is called. */
- atomic_t s_last_trim_minblks;
+ unsigned long s_last_trim_minblks;
/* Reference to checksum algorithm driver via cryptoapi */
struct crypto_shash *s_chksum_driver;
@@ -1696,6 +1701,7 @@ struct ext4_sb_info {
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
+ u64 s_dax_part_off;
#ifdef CONFIG_EXT4_DEBUG
unsigned long s_simulate_fail;
#endif
@@ -1724,9 +1730,9 @@ struct ext4_sb_info {
*/
struct work_struct s_error_work;
- /* Ext4 fast commit stuff */
+ /* Ext4 fast commit sub transaction ID */
atomic_t s_fc_subtid;
- atomic_t s_fc_ineligible_updates;
+
/*
* After commit starts, the main queue gets locked, and the further
* updates get added in the staging queue.
@@ -1746,7 +1752,7 @@ struct ext4_sb_info {
spinlock_t s_fc_lock;
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
- u64 s_fc_avg_commit_time;
+ tid_t s_fc_ineligible_tid;
#ifdef CONFIG_EXT4_DEBUG
int s_fc_debug_max_replay;
#endif
@@ -1792,10 +1798,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
enum {
EXT4_MF_MNTDIR_SAMPLED,
EXT4_MF_FS_ABORTED, /* Fatal error detected */
- EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
- EXT4_MF_FC_COMMITTING /* File system underoing a fast
- * commit.
- */
+ EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */
};
static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
@@ -2270,6 +2273,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
+/*
+ * Base length of the ext4 directory entry excluding the name length
+ */
+#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)
struct ext4_dir_entry {
__le32 inode; /* Inode number */
@@ -2398,8 +2405,7 @@ ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
- if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
- BUG();
+ BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3));
#if (PAGE_SIZE >= 65536)
if (len < 65536)
return cpu_to_le16(len);
@@ -2483,7 +2489,7 @@ struct ext4_filename {
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_str crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
struct fscrypt_str cf_name;
#endif
};
@@ -2719,7 +2725,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
extern int ext4_fname_setup_ci_filename(struct inode *dir,
const struct qstr *iname,
struct ext4_filename *fname);
@@ -2752,7 +2758,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
ext4_fname_from_fscrypt_name(fname, &name);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif
return err;
@@ -2771,7 +2777,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
ext4_fname_from_fscrypt_name(fname, &name);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
#endif
return err;
@@ -2788,7 +2794,7 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname)
fname->usr_fname = NULL;
fname->disk_name.name = NULL;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
kfree(fname->cf_name.name);
fname->cf_name.name = NULL;
#endif
@@ -2804,7 +2810,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif
@@ -2820,7 +2826,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
kfree(fname->cf_name.name);
fname->cf_name.name = NULL;
#endif
@@ -2924,9 +2930,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
-void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
-void ext4_fc_start_ineligible(struct super_block *sb, int reason);
-void ext4_fc_stop_ineligible(struct super_block *sb);
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
void ext4_fc_start_update(struct inode *inode);
void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
@@ -2934,6 +2938,10 @@ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
int __init ext4_fc_init_dentry_cache(void);
+void ext4_fc_destroy_dentry_cache(void);
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk,
+ int len, int replay);
/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
@@ -3028,7 +3036,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
-extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
@@ -3060,6 +3068,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
+int ext4_update_overhead(struct super_block *sb);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
@@ -3095,6 +3104,9 @@ extern int ext4_group_extend(struct super_block *sb,
struct ext4_super_block *es,
ext4_fsblk_t n_blocks_count);
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
+extern unsigned int ext4_list_backups(struct super_block *sb,
+ unsigned int *three, unsigned int *five,
+ unsigned int *seven);
/* super.c */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
@@ -3109,6 +3121,8 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
+extern __le32 ext4_superblock_csum(struct super_block *sb,
+ struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
@@ -3401,7 +3415,7 @@ do { \
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif
-/* Update i_disksize. Requires i_mutex to avoid races with truncate */
+/* Update i_disksize. Requires i_rwsem to avoid races with truncate */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
@@ -3412,7 +3426,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
-/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
+/* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
int changed = 0;
@@ -3700,6 +3714,9 @@ extern int ext4_inode_block_valid(struct inode *inode,
unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
+extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count);
+
/* extents.c */
struct ext4_ext_path;
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6def7339056d..3477a16d08ae 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -162,6 +162,8 @@ int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
{
if (!ext4_handle_valid(handle))
return 0;
+ if (is_handle_aborted(handle))
+ return -EROFS;
if (jbd2_handle_buffer_credits(handle) >= check_cred &&
handle->h_revoke_credits >= revoke_cred)
return 0;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0e4fa644df01..db2ae4a2b38d 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -491,7 +491,7 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
- * i_mutex for direct I/O reads. This only works for extent-based
+ * i_rwsem for direct I/O reads. This only works for extent-based
* files, and it doesn't work if data journaling is enabled, since the
* dioread_nolock code uses b_private to pass information back to the
* I/O completion handler, and this conflicts with the jbd's use of
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0e02571f2f82..e473fde6b64b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -27,8 +27,8 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
-#include <linux/backing-dev.h>
#include <linux/iomap.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
@@ -97,7 +97,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
+ * i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
ext4_discard_preallocations(inode, 0);
@@ -136,15 +136,25 @@ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
+ int err = 0;
+
if (path->p_bh) {
/* path points to block */
BUFFER_TRACE(path->p_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, inode->i_sb,
- path->p_bh, EXT4_JTR_NONE);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ path->p_bh, EXT4_JTR_NONE);
+ /*
+ * The extent buffer's verified bit will be set again in
+ * __ext4_ext_dirty(). We could leave an inconsistent
+ * buffer if the extents updating procedure breaks off due
+ * to some error, so force it to be checked again.
+ */
+ if (!err)
+ clear_buffer_verified(path->p_bh);
}
/* path points to leaf/index in inode body */
/* we use in-core data, no need to protect them */
- return 0;
+ return err;
}
/*
@@ -165,6 +175,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
/* path points to block */
err = __ext4_handle_dirty_metadata(where, line, handle,
inode, path->p_bh);
+ /* Extents updating done, re-set verified flag */
+ if (!err)
+ set_buffer_verified(path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -354,9 +367,13 @@ static int ext4_valid_extent_idx(struct inode *inode,
static int ext4_valid_extent_entries(struct inode *inode,
struct ext4_extent_header *eh,
- ext4_fsblk_t *pblk, int depth)
+ ext4_lblk_t lblk, ext4_fsblk_t *pblk,
+ int depth)
{
unsigned short entries;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t prev = 0;
+
if (eh->eh_entries == 0)
return 1;
@@ -365,31 +382,51 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
- ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
- int len = 0;
+
+ /*
+ * The logical block in the first entry should equal
+ * the number in the index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext->ee_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- len = ext4_ext_get_actual_len(ext);
if ((lblock <= prev) && prev) {
*pblk = ext4_ext_pblock(ext);
return 0;
}
+ prev = lblock + ext4_ext_get_actual_len(ext) - 1;
ext++;
entries--;
- prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+
+ /*
+ * The logical block in the first entry should equal
+ * the number in the parent index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext_idx->ei_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
+
+ /* Check for overlapping index extents */
+ lblock = le32_to_cpu(ext_idx->ei_block);
+ if ((lblock <= prev) && prev) {
+ *pblk = ext4_idx_pblock(ext_idx);
+ return 0;
+ }
ext_idx++;
entries--;
+ prev = lblock;
}
}
return 1;
@@ -397,7 +434,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth, ext4_fsblk_t pblk)
+ int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
const char *error_msg;
int max = 0, err = -EFSCORRUPTED;
@@ -423,7 +460,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid eh_entries";
goto corrupted;
}
- if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
+ if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
}
@@ -453,7 +490,7 @@ corrupted:
}
#define ext4_ext_check(inode, eh, depth, pblk) \
- __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
int ext4_ext_check_inode(struct inode *inode)
{
@@ -486,16 +523,18 @@ static void ext4_cache_extents(struct inode *inode,
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
- struct inode *inode, ext4_fsblk_t pblk, int depth,
- int flags)
+ struct inode *inode, struct ext4_extent_idx *idx,
+ int depth, int flags)
{
struct buffer_head *bh;
int err;
gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS;
+ ext4_fsblk_t pblk;
if (flags & EXT4_EX_NOFAIL)
gfp_flags |= __GFP_NOFAIL;
+ pblk = ext4_idx_pblock(idx);
bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
@@ -508,8 +547,8 @@ __read_extent_tree_block(const char *function, unsigned int line,
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
+ err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
+ depth, pblk, le32_to_cpu(idx->ei_block));
if (err)
goto errout;
set_buffer_verified(bh);
@@ -527,8 +566,8 @@ errout:
}
-#define read_extent_tree_block(inode, pblk, depth, flags) \
- __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+#define read_extent_tree_block(inode, idx, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
(depth), (flags))
/*
@@ -578,8 +617,7 @@ int ext4_ext_precache(struct inode *inode)
i--;
continue;
}
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx++),
+ bh = read_extent_tree_block(inode, path[i].p_idx++,
depth - i - 1,
EXT4_EX_FORCE_CACHE);
if (IS_ERR(bh)) {
@@ -714,13 +752,14 @@ ext4_ext_binsearch_idx(struct inode *inode,
r = EXT_LAST_INDEX(eh);
while (l <= r) {
m = l + (r - l) / 2;
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
+ r, le32_to_cpu(r->ei_block));
+
if (block < le32_to_cpu(m->ei_block))
r = m - 1;
else
l = m + 1;
- ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
- le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
- r, le32_to_cpu(r->ei_block));
}
path->p_idx = l - 1;
@@ -782,13 +821,14 @@ ext4_ext_binsearch(struct inode *inode,
while (l <= r) {
m = l + (r - l) / 2;
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
+ r, le32_to_cpu(r->ee_block));
+
if (block < le32_to_cpu(m->ee_block))
r = m - 1;
else
l = m + 1;
- ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
- le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
- r, le32_to_cpu(r->ee_block));
}
path->p_ext = l - 1;
@@ -884,8 +924,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
- flags);
+ bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
@@ -1457,8 +1496,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT4_ERROR_INODE(inode,
"ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
- EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
- le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block),
depth);
return -EFSCORRUPTED;
}
@@ -1494,7 +1532,6 @@ static int ext4_ext_search_right(struct inode *inode,
struct ext4_extent_header *eh;
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
- ext4_fsblk_t block;
int depth; /* Note, NOT eh_depth; depth from top of tree */
int ee_len;
@@ -1561,20 +1598,17 @@ got_index:
* follow it and find the closest allocated
* block to the right */
ix++;
- block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, block,
- path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
- block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1990,7 +2024,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
- eh = path[depth].p_hdr;
nearex = ex;
goto merge;
}
@@ -2019,7 +2052,6 @@ prepend:
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
- eh = path[depth].p_hdr;
nearex = ex;
goto merge;
}
@@ -2953,9 +2985,9 @@ again:
ext_debug(inode, "move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx), depth - i - 1,
- EXT4_EX_NOCACHE);
+ bh = read_extent_tree_block(inode, path[i].p_idx,
+ depth - i - 1,
+ EXT4_EX_NOCACHE);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -3336,7 +3368,6 @@ static int ext4_split_extent(handle_t *handle,
return -EFSCORRUPTED;
}
unwritten = ext4_ext_is_unwritten(ex);
- split_flag1 = 0;
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
@@ -4372,8 +4403,7 @@ retry:
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ memalloc_retry_wait(GFP_ATOMIC);
goto retry;
}
if (err)
@@ -4381,8 +4411,7 @@ retry:
retry_remove_space:
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ memalloc_retry_wait(GFP_ATOMIC);
goto retry_remove_space;
}
return err;
@@ -4471,9 +4500,9 @@ retry:
return ret > 0 ? ret2 : ret;
}
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
@@ -4542,9 +4571,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
- /* Wait all existing dio workers, newcomers will block on i_mutex */
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4612,8 +4645,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
- ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
- (offset + len - 1) >> inode->i_sb->s_blocksize_bits);
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
if (ret >= 0)
@@ -4662,10 +4693,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
- ext4_fc_start_update(inode);
-
if (mode & FALLOC_FL_PUNCH_HOLE) {
- ret = ext4_punch_hole(inode, offset, len);
+ ret = ext4_punch_hole(file, offset, len);
goto exit;
}
@@ -4674,12 +4703,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto exit;
if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- ret = ext4_collapse_range(inode, offset, len);
+ ret = ext4_collapse_range(file, offset, len);
goto exit;
}
if (mode & FALLOC_FL_INSERT_RANGE) {
- ret = ext4_insert_range(inode, offset, len);
+ ret = ext4_insert_range(file, offset, len);
goto exit;
}
@@ -4712,9 +4741,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
- /* Wait all existing dio workers, newcomers will block on i_mutex */
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
if (ret)
goto out;
@@ -4727,7 +4760,6 @@ out:
inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
exit:
- ext4_fc_stop_update(inode);
return ret;
}
@@ -4978,36 +5010,6 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
/*
- * ext4_access_path:
- * Function to access the path buffer for marking it dirty.
- * It also checks if there are sufficient credits left in the journal handle
- * to update path.
- */
-static int
-ext4_access_path(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
-{
- int credits, err;
-
- if (!ext4_handle_valid(handle))
- return 0;
-
- /*
- * Check if need to extend journal credits
- * 3 for leaf, sb, and inode plus 2 (bmap and group
- * descriptor) for each block group; assume two block
- * groups
- */
- credits = ext4_writepage_trans_blocks(inode);
- err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
- if (err < 0)
- return err;
-
- err = ext4_ext_get_access(handle, inode, path);
- return err;
-}
-
-/*
* ext4_ext_shift_path_extents:
* Shift the extents of a path structure lying between path[depth].p_ext
* and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
@@ -5021,6 +5023,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
int depth, err = 0;
struct ext4_extent *ex_start, *ex_last;
bool update = false;
+ int credits, restart_credits;
depth = path->p_depth;
while (depth >= 0) {
@@ -5030,13 +5033,26 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
return -EFSCORRUPTED;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ /* leaf + sb + inode */
+ credits = 3;
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+ update = true;
+ /* extent tree + sb + inode */
+ credits = depth + 2;
+ }
- err = ext4_access_path(handle, inode, path + depth);
- if (err)
+ restart_credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ restart_credits, 0);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
goto out;
+ }
- if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
- update = true;
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
while (ex_start <= ex_last) {
if (SHIFT == SHIFT_LEFT) {
@@ -5067,7 +5083,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
}
/* Update index too */
- err = ext4_access_path(handle, inode, path + depth);
+ err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
@@ -5106,6 +5122,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
int ret = 0, depth;
struct ext4_extent *extent;
ext4_lblk_t stop, *iterator, ex_start, ex_end;
+ ext4_lblk_t tmp = EXT_MAX_BLOCKS;
/* Let path point to the last extent */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
@@ -5159,11 +5176,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* till we reach stop. In case of right shift, iterator points to stop
* and it is decreased till we reach start.
*/
+again:
if (SHIFT == SHIFT_LEFT)
iterator = &start;
else
iterator = &stop;
+ if (tmp != EXT_MAX_BLOCKS)
+ *iterator = tmp;
+
/*
* It's safe to start updating extents. Start and stop are unsigned, so
* in case of right shift if extent with 0 block is reached, iterator
@@ -5192,6 +5213,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
}
}
+ tmp = *iterator;
if (SHIFT == SHIFT_LEFT) {
extent = EXT_LAST_EXTENT(path[depth].p_hdr);
*iterator = le32_to_cpu(extent->ee_block) +
@@ -5210,6 +5232,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
}
ret = ext4_ext_shift_path_extents(path, shift, inode,
handle, SHIFT);
+ /* iterator can be NULL which means we should break */
+ if (ret == -EAGAIN)
+ goto again;
if (ret)
break;
}
@@ -5224,8 +5249,9 @@ out:
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 on success and non-zero on error.
*/
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
ext4_lblk_t punch_start, punch_stop;
@@ -5277,6 +5303,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -5316,7 +5346,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ret = PTR_ERR(handle);
goto out_mmap;
}
- ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
@@ -5355,7 +5385,6 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
- ext4_fc_stop_ineligible(sb);
out_mmap:
filemap_invalidate_unlock(mapping);
out_mutex:
@@ -5371,8 +5400,9 @@ out_mutex:
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
handle_t *handle;
@@ -5429,6 +5459,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -5457,7 +5491,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
ret = PTR_ERR(handle);
goto out_mmap;
}
- ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
@@ -5532,7 +5566,6 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
- ext4_fc_stop_ineligible(sb);
out_mmap:
filemap_invalidate_unlock(mapping);
out_mutex:
@@ -5555,7 +5588,7 @@ out_mutex:
* stuff such as page-cache locking consistency, bh mapping consistency or
* extent's data copying must be performed by caller.
* Locking:
- * i_mutex is held for both inodes
+ * i_rwsem is held for both inodes
* i_data_sem is locked for write for both inodes
* Assumptions:
* All pages from requested range are locked for both inodes
@@ -6043,6 +6076,9 @@ int ext4_ext_clear_bb(struct inode *inode)
int j, ret = 0;
struct ext4_map_blocks map;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ return 0;
+
/* Determine the size of the file first */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
EXT4_EX_NOCACHE);
@@ -6072,11 +6108,15 @@ int ext4_ext_clear_bb(struct inode *inode)
ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, 0);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ 0, path[j].p_block, 1, 1);
}
ext4_ext_drop_refs(path);
kfree(path);
}
ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ map.m_lblk, map.m_pblk, map.m_len, 1);
}
cur = cur + map.m_len;
}
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 8ea5a81e6554..3d72565ec6e8 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -65,21 +65,11 @@
*
* Fast Commit Ineligibility
* -------------------------
- * Not all operations are supported by fast commits today (e.g extended
- * attributes). Fast commit ineligibility is marked by calling one of the
- * two following functions:
- *
- * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
- * back to full commit. This is useful in case of transient errors.
*
- * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
- * the fast commits happening between ext4_fc_start_ineligible() and
- * ext4_fc_stop_ineligible() and one fast commit after the call to
- * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
- * make one more fast commit to fall back to full commit after stop call so
- * that it guaranteed that the fast commit ineligible operation contained
- * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
- * followed by at least 1 full commit.
+ * Not all operations are supported by fast commits today (e.g. extended
+ * attributes). Fast commit ineligibility is marked by calling
+ * ext4_fc_mark_ineligible(), which makes the next fast commit operation fall
+ * back to a full commit.
*
* Atomicity of commits
* --------------------
@@ -166,15 +156,13 @@
* fast commit recovery even if that area is invalidated by later full
* commits.
*
- * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
- * eligible update must be protected within ext4_fc_start_update() and
- * ext4_fc_stop_update(). These routines are called at much higher
- * routines. This can be made more fine grained by combining with
- * ext4_journal_start().
+ * 1) Fast commit's commit path locks the entire file system during a fast
+ * commit. This has a significant performance penalty. Instead, we should
+ * use the ext4_fc_start/stop_update functions to start inode-level
+ * updates from ext4_journal_start/stop. Once we do that, we can drop the
+ * file system locking in the commit path.
*
- * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
- *
- * 3) Handle more ineligible cases.
+ * 2) Handle more ineligible cases.
*/
#include <trace/events/ext4.h>
@@ -211,6 +199,7 @@ void ext4_fc_init_inode(struct inode *inode)
ext4_fc_reset_inode(inode);
ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
INIT_LIST_HEAD(&ei->i_fc_list);
+ INIT_LIST_HEAD(&ei->i_fc_dilist);
init_waitqueue_head(&ei->i_fc_wait);
atomic_set(&ei->i_fc_updates, 0);
}
@@ -291,6 +280,8 @@ void ext4_fc_stop_update(struct inode *inode)
void ext4_fc_del(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_fc_dentry_update *fc_dentry;
if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
@@ -298,7 +289,7 @@ void ext4_fc_del(struct inode *inode)
restart:
spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- if (list_empty(&ei->i_fc_list)) {
+ if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
return;
}
@@ -307,63 +298,64 @@ restart:
ext4_fc_wait_committing_inode(inode);
goto restart;
}
- list_del_init(&ei->i_fc_list);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
-}
-/*
- * Mark file system as fast commit ineligible. This means that next commit
- * operation would result in a full jbd2 commit.
- */
-void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ if (!list_empty(&ei->i_fc_list))
+ list_del_init(&ei->i_fc_list);
- if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ /*
+ * Since this inode is getting removed, let's also remove all FC
+ * dentry create references, since they no longer need to be logged.
+ */
+ if (list_empty(&ei->i_fc_dilist)) {
+ spin_unlock(&sbi->s_fc_lock);
return;
+ }
- ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- WARN_ON(reason >= EXT4_FC_REASON_MAX);
- sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
-}
+ fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+ WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
+ list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
-/*
- * Start a fast commit ineligible update. Any commits that happen while
- * such an operation is in progress fall back to full commits.
- */
-void ext4_fc_start_ineligible(struct super_block *sb, int reason)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ spin_unlock(&sbi->s_fc_lock);
- if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
- return;
+ if (fc_dentry->fcd_name.name &&
+ fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
+ kfree(fc_dentry->fcd_name.name);
+ kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
- WARN_ON(reason >= EXT4_FC_REASON_MAX);
- sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
- atomic_inc(&sbi->s_fc_ineligible_updates);
+ return;
}
/*
- * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
- * to ensure that after stopping the ineligible update, at least one full
- * commit takes place.
+ * Mark the file system as fast commit ineligible, and record the latest
+ * ineligible transaction tid. This means that, until the recorded
+ * transaction, a commit operation results in a full jbd2 commit.
*/
-void ext4_fc_stop_ineligible(struct super_block *sb)
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ tid_t tid;
+
if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
-}
-
-static inline int ext4_fc_is_ineligible(struct super_block *sb)
-{
- return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
- atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
+ if (handle && !IS_ERR(handle))
+ tid = handle->h_transaction->t_tid;
+ else {
+ read_lock(&sbi->s_journal->j_state_lock);
+ tid = sbi->s_journal->j_running_transaction ?
+ sbi->s_journal->j_running_transaction->t_tid : 0;
+ read_unlock(&sbi->s_journal->j_state_lock);
+ }
+ spin_lock(&sbi->s_fc_lock);
+ if (sbi->s_fc_ineligible_tid < tid)
+ sbi->s_fc_ineligible_tid = tid;
+ spin_unlock(&sbi->s_fc_lock);
+ WARN_ON(reason >= EXT4_FC_REASON_MAX);
+ sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
/*
@@ -387,13 +379,6 @@ static int ext4_fc_track_template(
tid_t tid = 0;
int ret;
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
- return -EOPNOTSUPP;
-
- if (ext4_fc_is_ineligible(inode->i_sb))
- return -EINVAL;
-
tid = handle->h_transaction->t_tid;
mutex_lock(&ei->i_fc_lock);
if (tid == ei->i_sync_tid) {
@@ -411,7 +396,8 @@ static int ext4_fc_track_template(
spin_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
- (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
+ (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
spin_unlock(&sbi->s_fc_lock);
@@ -437,7 +423,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
mutex_unlock(&ei->i_fc_lock);
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -450,7 +436,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
if (!node->fcd_name.name) {
kmem_cache_free(ext4_fc_dentry_cachep, node);
ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_NOMEM);
+ EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -462,13 +448,28 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
node->fcd_name.name = node->fcd_iname;
}
node->fcd_name.len = dentry->d_name.len;
-
+ INIT_LIST_HEAD(&node->fcd_dilist);
spin_lock(&sbi->s_fc_lock);
- if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
+ if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
list_add_tail(&node->fcd_list,
&sbi->s_fc_dentry_q[FC_Q_STAGING]);
else
list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+
+ /*
+ * This helps us keep track of all fc_dentry updates that are part of
+ * this ext4 inode. So if the inode gets unlinked before we even
+ * get a chance to fsync, we can remove all fc_dentry
+ * references while evicting the inode in ext4_fc_del().
+ * Also, with this we don't need to loop over all the inodes in
+ * sbi->s_fc_q to get the corresponding inode in
+ * ext4_fc_commit_dentry_updates().
+ */
+ if (dentry_update->op == EXT4_FC_TAG_CREAT) {
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
+ }
spin_unlock(&sbi->s_fc_lock);
mutex_lock(&ei->i_fc_lock);
@@ -486,12 +487,22 @@ void __ext4_fc_track_unlink(handle_t *handle,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
- trace_ext4_fc_track_unlink(inode, dentry, ret);
+ trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
- __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
+ struct inode *inode = d_inode(dentry);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_unlink(handle, inode, dentry);
}
void __ext4_fc_track_link(handle_t *handle,
@@ -505,12 +516,22 @@ void __ext4_fc_track_link(handle_t *handle,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
- trace_ext4_fc_track_link(inode, dentry, ret);
+ trace_ext4_fc_track_link(handle, inode, dentry, ret);
}
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
- __ext4_fc_track_link(handle, d_inode(dentry), dentry);
+ struct inode *inode = d_inode(dentry);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_link(handle, inode, dentry);
}
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
@@ -524,12 +545,22 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
- trace_ext4_fc_track_create(inode, dentry, ret);
+ trace_ext4_fc_track_create(handle, inode, dentry, ret);
}
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
- __ext4_fc_track_create(handle, d_inode(dentry), dentry);
+ struct inode *inode = d_inode(dentry);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_create(handle, inode, dentry);
}
/* __track_fn for inode tracking */
@@ -545,6 +576,7 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
if (S_ISDIR(inode->i_mode))
@@ -552,12 +584,19 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
if (ext4_should_journal_data(inode)) {
ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_INODE_JOURNAL_DATA);
+ EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
return;
}
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
- trace_ext4_fc_track_inode(inode, ret);
+ trace_ext4_fc_track_inode(handle, inode, ret);
}
struct __track_range_args {
@@ -595,18 +634,26 @@ static int __track_range(struct inode *inode, void *arg, bool update)
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct __track_range_args args;
int ret;
if (S_ISDIR(inode->i_mode))
return;
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
args.start = start;
args.end = end;
ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
- trace_ext4_fc_track_range(inode, start, end, ret);
+ trace_ext4_fc_track_range(handle, inode, start, end, ret);
}
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
@@ -796,7 +843,6 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
dst += sizeof(fcd);
ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
- dst += dlen;
return true;
}
@@ -819,7 +865,9 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
if (ret)
return ret;
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ inode_len = EXT4_INODE_SIZE(inode->i_sb);
+ else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
inode_len += ei->i_extra_isize;
fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
@@ -928,7 +976,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
int ret = 0;
spin_lock(&sbi->s_fc_lock);
- ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
while (atomic_read(&ei->i_fc_updates)) {
@@ -988,7 +1035,7 @@ __releases(&sbi->s_fc_lock)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
struct inode *inode;
- struct ext4_inode_info *ei, *ei_n;
+ struct ext4_inode_info *ei;
int ret;
if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
@@ -1004,21 +1051,16 @@ __releases(&sbi->s_fc_lock)
spin_lock(&sbi->s_fc_lock);
continue;
}
-
- inode = NULL;
- list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
- i_fc_list) {
- if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
- inode = &ei->vfs_inode;
- break;
- }
- }
/*
- * If we don't find inode in our list, then it was deleted,
- * in which case, we don't need to record it's create tag.
+ * With fcd_dilist we need not loop in sbi->s_fc_q to get the
+ * corresponding inode pointer
*/
- if (!inode)
- continue;
+ WARN_ON(list_empty(&fc_dentry->fcd_dilist));
+ ei = list_first_entry(&fc_dentry->fcd_dilist,
+ struct ext4_inode_info, i_fc_dilist);
+ inode = &ei->vfs_inode;
+ WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
+
spin_unlock(&sbi->s_fc_lock);
/*
@@ -1121,6 +1163,33 @@ out:
return ret;
}
+static void ext4_fc_update_stats(struct super_block *sb, int status,
+ u64 commit_time, int nblks, tid_t commit_tid)
+{
+ struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
+
+ jbd_debug(1, "Fast commit ended with status = %d for tid %u",
+ status, commit_tid);
+ if (status == EXT4_FC_STATUS_OK) {
+ stats->fc_num_commits++;
+ stats->fc_numblks += nblks;
+ if (likely(stats->s_fc_avg_commit_time))
+ stats->s_fc_avg_commit_time =
+ (commit_time +
+ stats->s_fc_avg_commit_time * 3) / 4;
+ else
+ stats->s_fc_avg_commit_time = commit_time;
+ } else if (status == EXT4_FC_STATUS_FAILED ||
+ status == EXT4_FC_STATUS_INELIGIBLE) {
+ if (status == EXT4_FC_STATUS_FAILED)
+ stats->fc_failed_commits++;
+ stats->fc_ineligible_commits++;
+ } else {
+ stats->fc_skipped_commits++;
+ }
+ trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
+}
+
/*
* The main commit entry point. Performs a fast commit for transaction
* commit_tid if needed. If it's not possible to perform a fast commit
@@ -1133,18 +1202,15 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int nblks = 0, ret, bsize = journal->j_blocksize;
int subtid = atomic_read(&sbi->s_fc_subtid);
- int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
+ int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
ktime_t start_time, commit_time;
- trace_ext4_fc_commit_start(sb);
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
+ return jbd2_complete_transaction(journal, commit_tid);
- start_time = ktime_get();
+ trace_ext4_fc_commit_start(sb, commit_tid);
- if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
- (ext4_fc_is_ineligible(sb))) {
- reason = EXT4_FC_REASON_INELIGIBLE;
- goto out;
- }
+ start_time = ktime_get();
restart_fc:
ret = jbd2_fc_begin_commit(journal, commit_tid);
@@ -1153,74 +1219,61 @@ restart_fc:
if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
commit_tid > journal->j_commit_sequence)
goto restart_fc;
- reason = EXT4_FC_REASON_ALREADY_COMMITTED;
- goto out;
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
+ commit_tid);
+ return 0;
} else if (ret) {
- sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
- reason = EXT4_FC_REASON_FC_START_FAILED;
- goto out;
+ /*
+ * Commit couldn't start. Just update stats and perform a
+ * full commit.
+ */
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
+ commit_tid);
+ return jbd2_complete_transaction(journal, commit_tid);
+ }
+
+ /*
+ * After establishing journal barrier via jbd2_fc_begin_commit(), check
+ * if we are fast commit ineligible.
+ */
+ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
+ status = EXT4_FC_STATUS_INELIGIBLE;
+ goto fallback;
}
fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
ret = ext4_fc_perform_commit(journal);
if (ret < 0) {
- sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
- reason = EXT4_FC_REASON_FC_FAILED;
- goto out;
+ status = EXT4_FC_STATUS_FAILED;
+ goto fallback;
}
nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
ret = jbd2_fc_wait_bufs(journal, nblks);
if (ret < 0) {
- sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
- reason = EXT4_FC_REASON_FC_FAILED;
- goto out;
+ status = EXT4_FC_STATUS_FAILED;
+ goto fallback;
}
atomic_inc(&sbi->s_fc_subtid);
- jbd2_fc_end_commit(journal);
-out:
- /* Has any ineligible update happened since we started? */
- if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
- sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
- reason = EXT4_FC_REASON_INELIGIBLE;
- }
-
- spin_lock(&sbi->s_fc_lock);
- if (reason != EXT4_FC_REASON_OK &&
- reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
- sbi->s_fc_stats.fc_ineligible_commits++;
- } else {
- sbi->s_fc_stats.fc_num_commits++;
- sbi->s_fc_stats.fc_numblks += nblks;
- }
- spin_unlock(&sbi->s_fc_lock);
- nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
- trace_ext4_fc_commit_stop(sb, nblks, reason);
- commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ ret = jbd2_fc_end_commit(journal);
/*
- * weight the commit time higher than the average time so we don't
- * react too strongly to vast changes in the commit time
+ * weight the commit time higher than the average time so we
+ * don't react too strongly to vast changes in the commit time
*/
- if (likely(sbi->s_fc_avg_commit_time))
- sbi->s_fc_avg_commit_time = (commit_time +
- sbi->s_fc_avg_commit_time * 3) / 4;
- else
- sbi->s_fc_avg_commit_time = commit_time;
- jbd_debug(1,
- "Fast commit ended with blks = %d, reason = %d, subtid - %d",
- nblks, reason, subtid);
- if (reason == EXT4_FC_REASON_FC_FAILED)
- return jbd2_fc_end_commit_fallback(journal);
- if (reason == EXT4_FC_REASON_FC_START_FAILED ||
- reason == EXT4_FC_REASON_INELIGIBLE)
- return jbd2_complete_transaction(journal, commit_tid);
- return 0;
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
+ return ret;
+
+fallback:
+ ret = jbd2_fc_end_commit_fallback(journal);
+ ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
+ return ret;
}
/*
* Fast commit cleanup routine. This is called after every fast commit and
* full commit. full is true if we are called after a full commit.
*/
-static void ext4_fc_cleanup(journal_t *journal, int full)
+static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1230,6 +1283,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
if (full && sbi->s_fc_bh)
sbi->s_fc_bh = NULL;
+ trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
spin_lock(&sbi->s_fc_lock);
@@ -1238,7 +1292,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- ext4_fc_reset_inode(&iter->vfs_inode);
+ if (iter->i_sync_tid <= tid)
+ ext4_fc_reset_inode(&iter->vfs_inode);
/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
smp_mb();
#if (BITS_PER_LONG < 64)
@@ -1253,6 +1308,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
struct ext4_fc_dentry_update,
fcd_list);
list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
spin_unlock(&sbi->s_fc_lock);
if (fc_dentry->fcd_name.name &&
@@ -1267,8 +1323,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
&sbi->s_fc_q[FC_Q_MAIN]);
- ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
- ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ if (tid >= sbi->s_fc_ineligible_tid) {
+ sbi->s_fc_ineligible_tid = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ }
if (full)
sbi->s_fc_bytes = 0;
@@ -1433,14 +1491,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
if (state->fc_modified_inodes[i] == ino)
return 0;
if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
- state->fc_modified_inodes_size +=
- EXT4_FC_REPLAY_REALLOC_INCREMENT;
state->fc_modified_inodes = krealloc(
- state->fc_modified_inodes, sizeof(int) *
- state->fc_modified_inodes_size,
- GFP_KERNEL);
+ state->fc_modified_inodes,
+ sizeof(int) * (state->fc_modified_inodes_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
if (!state->fc_modified_inodes)
return -ENOMEM;
+ state->fc_modified_inodes_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
}
state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
return 0;
@@ -1472,7 +1531,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
}
inode = NULL;
- ext4_fc_record_modified_inode(sb, ino);
+ ret = ext4_fc_record_modified_inode(sb, ino);
+ if (ret)
+ goto out;
raw_fc_inode = (struct ext4_inode *)
(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
@@ -1524,7 +1585,8 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
* crashing. This should be fixed but until then, we calculate
* the number of blocks the inode.
*/
- ext4_ext_replay_set_iblocks(inode);
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ ext4_ext_replay_set_iblocks(inode);
inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
ext4_reset_inode_seed(inode);
@@ -1603,16 +1665,23 @@ out:
}
/*
- * Record physical disk regions which are in use as per fast commit area. Our
- * simple replay phase allocator excludes these regions from allocation.
+ * Record physical disk regions which are in use as per fast commit area,
+ * and used by inodes during replay phase. Our simple replay phase
+ * allocator excludes these regions from allocation.
*/
-static int ext4_fc_record_regions(struct super_block *sb, int ino,
- ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
struct ext4_fc_replay_state *state;
struct ext4_fc_alloc_region *region;
state = &EXT4_SB(sb)->s_fc_replay_state;
+ /*
+ * During the replay phase, fc_regions_valid may not be the same as
+ * fc_regions_used; resync fc_regions_used before adding new entries.
+ */
+ if (replay && state->fc_regions_used != state->fc_regions_valid)
+ state->fc_regions_used = state->fc_regions_valid;
if (state->fc_regions_used == state->fc_regions_size) {
state->fc_regions_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
@@ -1630,6 +1699,9 @@ static int ext4_fc_record_regions(struct super_block *sb, int ino,
region->pblk = pblk;
region->len = len;
+ if (replay)
+ state->fc_regions_valid++;
+
return 0;
}
@@ -1661,6 +1733,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
}
ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
start = le32_to_cpu(ex->ee_block);
start_pblk = ext4_ext_pblock(ex);
@@ -1678,18 +1752,14 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
map.m_pblk = 0;
ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0) {
- iput(inode);
- return 0;
- }
+ if (ret < 0)
+ goto out;
if (ret == 0) {
/* Range is not mapped */
path = ext4_find_extent(inode, cur, NULL, 0);
- if (IS_ERR(path)) {
- iput(inode);
- return 0;
- }
+ if (IS_ERR(path))
+ goto out;
memset(&newex, 0, sizeof(newex));
newex.ee_block = cpu_to_le32(cur);
ext4_ext_store_pblock(
@@ -1703,10 +1773,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
up_write((&EXT4_I(inode)->i_data_sem));
ext4_ext_drop_refs(path);
kfree(path);
- if (ret) {
- iput(inode);
- return 0;
- }
+ if (ret)
+ goto out;
goto next;
}
@@ -1719,10 +1787,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
ext4_ext_is_unwritten(ex),
start_pblk + cur - start);
- if (ret) {
- iput(inode);
- return 0;
- }
+ if (ret)
+ goto out;
/*
* Mark the old blocks as free since they aren't used
* anymore. We maintain an array of all the modified
@@ -1742,10 +1808,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
ext4_ext_is_unwritten(ex), map.m_pblk);
ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
ext4_ext_is_unwritten(ex), map.m_pblk);
- if (ret) {
- iput(inode);
- return 0;
- }
+ if (ret)
+ goto out;
/*
* We may have split the extent tree while toggling the state.
* Try to shrink the extent tree now.
@@ -1757,6 +1821,7 @@ next:
}
ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
sb->s_blocksize_bits);
+out:
iput(inode);
return 0;
}
@@ -1786,6 +1851,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
}
ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
inode->i_ino, le32_to_cpu(lrange.fc_lblk),
@@ -1795,10 +1862,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
map.m_len = remaining;
ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0) {
- iput(inode);
- return 0;
- }
+ if (ret < 0)
+ goto out;
if (ret > 0) {
remaining -= ret;
cur += ret;
@@ -1809,16 +1874,18 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
}
}
- ret = ext4_punch_hole(inode,
- le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
- le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
+ le32_to_cpu(lrange.fc_lblk) +
+ le32_to_cpu(lrange.fc_len) - 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
if (ret)
- jbd_debug(1, "ext4_punch_hole returned %d", ret);
+ goto out;
ext4_ext_replay_shrink_inode(inode,
i_size_read(inode) >> sb->s_blocksize_bits);
ext4_mark_inode_dirty(NULL, inode);
+out:
iput(inode);
-
return 0;
}
@@ -1842,6 +1909,10 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
}
cur = 0;
end = EXT_MAX_BLOCKS;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
+ iput(inode);
+ continue;
+ }
while (cur < end) {
map.m_lblk = cur;
map.m_len = end - cur;
@@ -1885,8 +1956,8 @@ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
if (state->fc_regions[i].ino == 0 ||
state->fc_regions[i].len == 0)
continue;
- if (blk >= state->fc_regions[i].pblk &&
- blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
+ if (in_range(blk, state->fc_regions[i].pblk,
+ state->fc_regions[i].len))
return true;
}
return false;
@@ -1970,7 +2041,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
ret = ext4_fc_record_regions(sb,
le32_to_cpu(ext.fc_ino),
le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
- ext4_ext_get_actual_len(ex));
+ ext4_ext_get_actual_len(ex), 0);
if (ret < 0)
break;
ret = JBD2_FC_REPLAY_CONTINUE;
@@ -2166,7 +2237,7 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
stats->fc_num_commits, stats->fc_ineligible_commits,
stats->fc_numblks,
- div_u64(sbi->s_fc_avg_commit_time, 1000));
+ div_u64(stats->s_fc_avg_commit_time, 1000));
seq_puts(seq, "Ineligible reasons:\n");
for (i = 0; i < EXT4_FC_REASON_MAX; i++)
seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
@@ -2185,3 +2256,8 @@ int __init ext4_fc_init_dentry_cache(void)
return 0;
}
+
+void ext4_fc_destroy_dentry_cache(void)
+{
+ kmem_cache_destroy(ext4_fc_dentry_cachep);
+}
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 937c381b4c85..1db12847a83b 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -55,13 +55,13 @@ struct ext4_fc_del_range {
struct ext4_fc_dentry_info {
__le32 fc_parent_ino;
__le32 fc_ino;
- __u8 fc_dname[0];
+ __u8 fc_dname[];
};
/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
struct ext4_fc_inode {
__le32 fc_ino;
- __u8 fc_raw_inode[0];
+ __u8 fc_raw_inode[];
};
/* Value structure for tag EXT4_FC_TAG_TAIL. */
@@ -71,21 +71,19 @@ struct ext4_fc_tail {
};
/*
- * Fast commit reason codes
+ * Fast commit status codes
+ */
+enum {
+ EXT4_FC_STATUS_OK = 0,
+ EXT4_FC_STATUS_INELIGIBLE,
+ EXT4_FC_STATUS_SKIPPED,
+ EXT4_FC_STATUS_FAILED,
+};
+
+/*
+ * Fast commit ineligibility reasons:
*/
enum {
- /*
- * Commit status codes:
- */
- EXT4_FC_REASON_OK = 0,
- EXT4_FC_REASON_INELIGIBLE,
- EXT4_FC_REASON_ALREADY_COMMITTED,
- EXT4_FC_REASON_FC_START_FAILED,
- EXT4_FC_REASON_FC_FAILED,
-
- /*
- * Fast commit ineligiblity reasons:
- */
EXT4_FC_REASON_XATTR = 0,
EXT4_FC_REASON_CROSS_RENAME,
EXT4_FC_REASON_JOURNAL_FLAG_CHANGE,
@@ -95,7 +93,6 @@ enum {
EXT4_FC_REASON_RENAME_DIR,
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
- EXT4_FC_COMMIT_FAILED,
EXT4_FC_REASON_MAX
};
@@ -111,13 +108,17 @@ struct ext4_fc_dentry_update {
struct qstr fcd_name; /* Dirent name */
unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */
struct list_head fcd_list;
+ struct list_head fcd_dilist;
};
struct ext4_fc_stats {
unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX];
unsigned long fc_num_commits;
unsigned long fc_ineligible_commits;
+ unsigned long fc_failed_commits;
+ unsigned long fc_skipped_commits;
unsigned long fc_numblks;
+ u64 s_fc_avg_commit_time;
};
#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ac0e11bbb445..6feb07e3e1eb 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,9 +36,11 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct inode *inode)
+static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
{
- if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (!fscrypt_dio_supported(iocb, iter))
return false;
if (fsverity_active(inode))
return false;
@@ -61,7 +63,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_dio_supported(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -74,7 +76,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -259,19 +261,17 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
- ext4_fc_start_update(inode);
inode_lock(inode);
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
current->backing_dev_info = inode_to_bdi(inode);
- ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+ ret = generic_perform_write(iocb, from);
current->backing_dev_info = NULL;
out:
inode_unlock(inode);
- ext4_fc_stop_update(inode);
if (likely(ret > 0)) {
iocb->ki_pos += ret;
ret = generic_write_sync(iocb, ret);
@@ -511,7 +511,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_dio_supported(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
@@ -552,9 +552,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- ext4_fc_start_update(inode);
ret = ext4_orphan_add(handle, inode);
- ext4_fc_stop_update(inode);
if (ret) {
ext4_journal_stop(handle);
goto out;
@@ -566,7 +564,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ilock_shared)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
+ (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
+ 0);
if (ret == -ENOTBLK)
ret = 0;
@@ -915,7 +914,7 @@ const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index f34f4176c1e7..147b5241dd94 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -290,7 +290,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
const struct unicode_map *um = dir->i_sb->s_encoding;
int r, dlen;
unsigned char *buff;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 89efa78ed4b2..07a8c75b65ed 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
+ * i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
ext4_discard_preallocations(inode, 0);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 39a1ab129fdc..9c076262770d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -7,7 +7,7 @@
#include <linux/iomap.h>
#include <linux/fiemap.h>
#include <linux/iversion.h>
-#include <linux/backing-dev.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -911,7 +911,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct page **pagep,
void **fsdata)
{
- int ret, inline_size;
+ int ret;
handle_t *handle;
struct page *page;
struct ext4_iloc iloc;
@@ -928,14 +928,9 @@ retry_journal:
goto out;
}
- inline_size = ext4_get_max_inline_size(inode);
-
- ret = -ENOSPC;
- if (inline_size >= pos + len) {
- ret = ext4_prepare_inline_data(handle, inode, pos + len);
- if (ret && ret != -ENOSPC)
- goto out_journal;
- }
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out_journal;
/*
* We cannot recurse into the filesystem as the transaction
@@ -1133,7 +1128,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc,
void *buf, int inline_size)
{
- ext4_create_inline_data(handle, inode, inline_size);
+ int ret;
+
+ ret = ext4_create_inline_data(handle, inode, inline_size);
+ if (ret) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)",
+ inode->i_ino, ret);
+ return;
+ }
ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
@@ -1780,19 +1783,20 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
void *inline_pos;
unsigned int offset;
struct ext4_dir_entry_2 *de;
- bool ret = true;
+ bool ret = false;
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
EXT4_ERROR_INODE_ERR(dir, -err,
"error %d getting inode %lu block",
err, dir->i_ino);
- return true;
+ return false;
}
down_read(&EXT4_I(dir)->xattr_sem);
if (!ext4_has_inline_data(dir)) {
*has_inline_data = 0;
+ ret = true;
goto out;
}
@@ -1801,7 +1805,6 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
ext4_warning(dir->i_sb,
"bad inline directory (dir #%lu) - no `..'",
dir->i_ino);
- ret = true;
goto out;
}
@@ -1820,16 +1823,15 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
dir->i_ino, le32_to_cpu(de->inode),
le16_to_cpu(de->rec_len), de->name_len,
inline_size);
- ret = true;
goto out;
}
if (le32_to_cpu(de->inode)) {
- ret = false;
goto out;
}
offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
}
+ ret = true;
out:
up_read(&EXT4_I(dir)->xattr_sem);
brelse(iloc.bh);
@@ -1929,8 +1931,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
retry:
err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ memalloc_retry_wait(GFP_ATOMIC);
goto retry;
}
if (err)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0f06305167d5..646ece9b3455 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,7 @@
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>
+#include <linux/dax.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -136,8 +137,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
@@ -185,7 +184,7 @@ void ext4_evict_inode(struct inode *inode)
* journal. So although mm thinks everything is clean and
* ready for reaping the inode might still have some pages to
* write in the running transaction or waiting to be
- * checkpointed. Thus calling jbd2_journal_invalidatepage()
+ * checkpointed. Thus calling jbd2_journal_invalidate_folio()
* (via truncate_inode_pages()) to discard these buffers can
* cause data loss. Also even if we did not discard these
* buffers, we would have no way to find them after the inode
@@ -337,7 +336,7 @@ stop_handle:
return;
no_delete:
if (!list_empty(&EXT4_I(inode)->i_fc_list))
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
@@ -741,10 +740,11 @@ out_sem:
if (ret)
return ret;
}
- ext4_fc_track_range(handle, inode, map->m_lblk,
- map->m_lblk + map->m_len - 1);
}
-
+ if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
+ map->m_flags & EXT4_MAP_MAPPED))
+ ext4_fc_track_range(handle, inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1);
if (retval < 0)
ext_debug(inode, "failed with err %d\n", retval);
return retval;
@@ -1222,7 +1222,7 @@ retry_journal:
/*
* __block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
+ * i_size_read because we hold i_rwsem.
*
* Add inode to orphan list in case we crash before
* truncate finishes
@@ -1569,16 +1569,18 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ struct folio *folio = page_folio(page);
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio_test_writeback(folio));
if (invalidate) {
- if (page_mapped(page))
- clear_page_dirty_for_io(page);
- block_invalidatepage(page, 0, PAGE_SIZE);
- ClearPageUptodate(page);
+ if (folio_mapped(folio))
+ folio_clear_dirty_for_io(folio);
+ block_invalidate_folio(folio, 0,
+ folio_size(folio));
+ folio_clear_uptodate(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
}
pagevec_release(&pvec);
}
@@ -1711,16 +1713,13 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
}
/*
- * the buffer head associated with a delayed and not unwritten
- * block found in the extent status cache must contain an
- * invalid block number and have its BH_New and BH_Delay bits
- * set, reflecting the state assigned when the block was
- * initially delayed allocated
+ * Delayed extent could be allocated by fallocate.
+ * So we need to check it.
*/
- if (ext4_es_is_delonly(&es)) {
- BUG_ON(bh->b_blocknr != invalid_block);
- BUG_ON(!buffer_new(bh));
- BUG_ON(!buffer_delay(bh));
+ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
return 0;
}
@@ -1847,30 +1846,16 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
-static int bget_one(handle_t *handle, struct inode *inode,
- struct buffer_head *bh)
-{
- get_bh(bh);
- return 0;
-}
-
-static int bput_one(handle_t *handle, struct inode *inode,
- struct buffer_head *bh)
-{
- put_bh(bh);
- return 0;
-}
-
static int __ext4_journalled_writepage(struct page *page,
unsigned int len)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
- struct buffer_head *page_bufs = NULL;
handle_t *handle = NULL;
int ret = 0, err = 0;
int inline_data = ext4_has_inline_data(inode);
struct buffer_head *inode_bh = NULL;
+ loff_t size;
ClearPageChecked(page);
@@ -1880,14 +1865,6 @@ static int __ext4_journalled_writepage(struct page *page,
inode_bh = ext4_journalled_write_inline_data(inode, len, page);
if (inode_bh == NULL)
goto out;
- } else {
- page_bufs = page_buffers(page);
- if (!page_bufs) {
- BUG();
- goto out;
- }
- ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
- NULL, bget_one);
}
/*
* We need to release the page lock before we start the
@@ -1908,7 +1885,8 @@ static int __ext4_journalled_writepage(struct page *page,
lock_page(page);
put_page(page);
- if (page->mapping != mapping) {
+ size = i_size_read(inode);
+ if (page->mapping != mapping || page_offset(page) > size) {
/* The page got truncated from under us */
ext4_journal_stop(handle);
ret = 0;
@@ -1918,6 +1896,13 @@ static int __ext4_journalled_writepage(struct page *page,
if (inline_data) {
ret = ext4_mark_inode_dirty(handle, inode);
} else {
+ struct buffer_head *page_bufs = page_buffers(page);
+
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
+ else
+ len = PAGE_SIZE;
+
ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
NULL, do_journal_get_write_access);
@@ -1938,9 +1923,6 @@ static int __ext4_journalled_writepage(struct page *page,
out:
unlock_page(page);
out_no_pagelock:
- if (!inline_data && page_bufs)
- ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len,
- NULL, bput_one);
brelse(inode_bh);
return ret;
}
@@ -1989,6 +1971,7 @@ out_no_pagelock:
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
loff_t size;
unsigned int len;
@@ -1998,8 +1981,8 @@ static int ext4_writepage(struct page *page,
bool keep_towrite = false;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
- inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_invalidate(folio, 0, folio_size(folio));
+ folio_unlock(folio);
return -EIO;
}
@@ -2011,6 +1994,15 @@ static int ext4_writepage(struct page *page,
else
len = PAGE_SIZE;
+ /* Should never happen but for bugs in other kernel subsystems */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(inode,
+ "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ return 0;
+ }
+
page_bufs = page_buffers(page);
/*
* We cannot do block allocation or other extent handling in this
@@ -2260,7 +2252,6 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
io_end_vec->size += io_end_size;
- io_end_size = 0;
err = mpage_process_page_bufs(mpd, head, bh, lblk);
if (err > 0)
@@ -2285,7 +2276,6 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
} while (lblk++, (bh = bh->b_this_page) != head);
io_end_vec->size += io_end_size;
- io_end_size = 0;
*map_bh = false;
out:
*m_lblk = lblk;
@@ -2614,6 +2604,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
+ /*
+ * Should never happen but for buggy code in
+ * other subsystems that call
+ * set_page_dirty() without properly warning
+ * the file system first. See [1] for more
+ * information.
+ *
+ * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+ */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ continue;
+ }
+
if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
@@ -3202,40 +3208,39 @@ static void ext4_readahead(struct readahead_control *rac)
ext4_mpage_readpages(inode, rac, NULL);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ext4_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- trace_ext4_invalidatepage(page, offset, length);
+ trace_ext4_invalidate_folio(folio, offset, length);
/* No journalling happens on data buffers when this function is used */
- WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio)));
- block_invalidatepage(page, offset, length);
+ block_invalidate_folio(folio, offset, length);
}
-static int __ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static int __ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
- trace_ext4_journalled_invalidatepage(page, offset, length);
+ trace_ext4_journalled_invalidate_folio(folio, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0 && length == PAGE_SIZE)
- ClearPageChecked(page);
+ if (offset == 0 && length == folio_size(folio))
+ folio_clear_checked(folio);
- return jbd2_journal_invalidatepage(journal, page, offset, length);
+ return jbd2_journal_invalidate_folio(journal, folio, offset, length);
}
/* Wrapper for aops... */
-static void ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static void ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset,
+ size_t length)
{
- WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
+ WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3274,7 +3279,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
struct ext4_map_blocks *map, loff_t offset,
- loff_t length)
+ loff_t length, unsigned int flags)
{
u8 blkbits = inode->i_blkbits;
@@ -3291,8 +3296,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ else
+ iomap->bdev = inode->i_sb->s_bdev;
iomap->offset = (u64) map->m_lblk << blkbits;
iomap->length = (u64) map->m_len << blkbits;
@@ -3312,9 +3319,13 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_UNWRITTEN) {
iomap->type = IOMAP_UNWRITTEN;
iomap->addr = (u64) map->m_pblk << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
} else if (map->m_flags & EXT4_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
iomap->addr = (u64) map->m_pblk << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
@@ -3351,8 +3362,8 @@ retry:
* DAX and direct I/O are the only two operations that are currently
* supported with IOMAP_WRITE.
*/
- WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
- if (IS_DAX(inode))
+ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT)));
+ if (flags & IOMAP_DAX)
m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
/*
* We use i_size instead of i_disksize here because delalloc writeback
@@ -3423,7 +3434,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret < 0)
return ret;
out:
- ext4_set_iomap(inode, iomap, &map, offset, length);
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
}
@@ -3543,7 +3561,7 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
delalloc = ext4_iomap_is_delalloc(inode, &map);
set_iomap:
- ext4_set_iomap(inode, iomap, &map, offset, length);
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
if (delalloc && iomap->type == IOMAP_HOLE)
iomap->type = IOMAP_DELALLOC;
@@ -3555,29 +3573,32 @@ const struct iomap_ops ext4_iomap_report_ops = {
};
/*
- * Pages can be marked dirty completely asynchronously from ext4's journalling
- * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
- * much here because ->set_page_dirty is called under VFS locks. The page is
- * not necessarily locked.
+ * Whenever the folio is being dirtied, corresponding buffers should already
+ * be attached to the transaction (we take care of this in ext4_page_mkwrite()
+ * and ext4_write_begin()). However we cannot move buffers to dirty transaction
+ * lists here because ->dirty_folio is called under VFS locks and the folio
+ * is not necessarily locked.
*
- * We cannot just dirty the page and leave attached buffers clean, because the
+ * We cannot just dirty the folio and leave attached buffers clean, because the
* buffers' dirty state is "definitive". We cannot just set the buffers dirty
* or jbddirty because all the journalling code will explode.
*
- * So what we do is to mark the page "pending dirty" and next time writepage
+ * So what we do is to mark the folio "pending dirty" and next time writepage
* is called, propagate that into the buffers appropriately.
*/
-static int ext4_journalled_set_page_dirty(struct page *page)
+static bool ext4_journalled_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- SetPageChecked(page);
- return __set_page_dirty_nobuffers(page);
+ WARN_ON_ONCE(!folio_buffers(folio));
+ folio_set_checked(folio);
+ return filemap_dirty_folio(mapping, folio);
}
-static int ext4_set_page_dirty(struct page *page)
+static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page));
- WARN_ON_ONCE(!page_has_buffers(page));
- return __set_page_dirty_buffers(page);
+ WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
+ WARN_ON_ONCE(!folio_buffers(folio));
+ return block_dirty_folio(mapping, folio);
}
static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
@@ -3594,9 +3615,9 @@ static const struct address_space_operations ext4_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidate_folio = ext4_invalidate_folio,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
@@ -3612,9 +3633,9 @@ static const struct address_space_operations ext4_journalled_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
- .set_page_dirty = ext4_journalled_set_page_dirty,
+ .dirty_folio = ext4_journalled_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_journalled_invalidatepage,
+ .invalidate_folio = ext4_journalled_invalidate_folio,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -3629,9 +3650,9 @@ static const struct address_space_operations ext4_da_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidate_folio = ext4_invalidate_folio,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
@@ -3643,9 +3664,8 @@ static const struct address_space_operations ext4_da_aops = {
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = noop_invalidatepage,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3783,8 +3803,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
length = max;
if (IS_DAX(inode)) {
- return iomap_zero_range(inode, from, length, NULL,
- &ext4_iomap_ops);
+ return dax_zero_range(inode, from, length, NULL,
+ &ext4_iomap_ops);
}
return __ext4_block_zero_page_range(handle, mapping, from, length);
}
@@ -3933,12 +3953,14 @@ int ext4_break_layouts(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset;
+ loff_t first_block_offset, last_block_offset, max_length;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
unsigned int credits;
int ret = 0, ret2 = 0;
@@ -3981,6 +4003,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
offset;
}
+ /*
+ * For punch hole the length + offset needs to be within one block
+ * before last range. Adjust the length if it goes beyond that limit.
+ */
+ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+ if (offset + length > max_length)
+ length = max_length - offset;
+
if (offset & (sb->s_blocksize - 1) ||
(offset + length) & (sb->s_blocksize - 1)) {
/*
@@ -3993,9 +4023,13 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
- /* Wait all existing dio workers, newcomers will block on i_mutex */
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -4143,7 +4177,7 @@ int ext4_truncate(struct inode *inode)
/*
* There is a possibility that we're either freeing the inode
* or it's a completely new inode. In those cases we might not
- * have i_mutex locked because it's not necessary.
+ * have i_rwsem locked because it's not necessary.
*/
if (!(inode->i_state & (I_NEW|I_FREEING)))
WARN_ON(!inode_is_locked(inode));
@@ -4234,14 +4268,161 @@ out_trace:
return err;
}
+/*
+ * Return the current i_version value of @inode.
+ *
+ * Inodes flagged EXT4_EA_INODE_FL are read with inode_peek_iversion_raw();
+ * all other inodes are read with inode_peek_iversion().
+ */
+static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
+{
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return inode_peek_iversion_raw(inode);
+ else
+ return inode_peek_iversion(inode);
+}
+
+/*
+ * Encode inode->i_blocks into raw_inode->i_blocks_lo / i_blocks_high and
+ * keep the EXT4_INODE_HUGE_FILE inode flag in sync with the encoding used.
+ *
+ * Values that fit in 32 bits are stored directly. Larger values require
+ * the huge_file feature: values up to 48 bits are stored as-is, anything
+ * bigger is first shifted right by (i_blkbits - 9) and the HUGE_FILE flag
+ * is set.
+ *
+ * Returns 0 on success, or -EFSCORRUPTED if the count needs more than
+ * 32 bits but the filesystem does not have the huge_file feature.
+ */
+static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ struct inode *inode = &(ei->vfs_inode);
+ u64 i_blocks = READ_ONCE(inode->i_blocks);
+ struct super_block *sb = inode->i_sb;
+
+ if (i_blocks <= ~0U) {
+ /*
+ * i_blocks can be represented in a 32 bit variable
+ * as multiple of 512 bytes
+ */
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = 0;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ return 0;
+ }
+
+ /*
+ * This should never happen since sb->s_maxbytes should not have
+ * allowed this, sb->s_maxbytes was set according to the huge_file
+ * feature in ext4_fill_super().
+ */
+ if (!ext4_has_feature_huge_file(sb))
+ return -EFSCORRUPTED;
+
+ if (i_blocks <= 0xffffffffffffULL) {
+ /*
+ * i_blocks can be represented in a 48 bit variable
+ * as multiple of 512 bytes
+ */
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ } else {
+ ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ /* i_blocks is stored in units of the file system block size */
+ i_blocks = i_blocks >> (inode->i_blkbits - 9);
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+ }
+ return 0;
+}
+
+/*
+ * Populate the on-disk inode @raw_inode from the in-memory state of @inode.
+ *
+ * Fills in the block count, mode, uid/gid, link count, timestamps, dtime,
+ * flags, i_file_acl, size, generation, i_block[] (encoded device number or
+ * the i_data[] block map), version and project id, then sets the inode
+ * checksum.
+ *
+ * An error does not stop the fill: every field is still written and the
+ * checksum is still set before the error is returned.
+ *
+ * Returns 0 on success, the error from ext4_inode_blocks_set(), or
+ * -EFSCORRUPTED if a non-default project id is set on a filesystem without
+ * the project feature.
+ */
+static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ uid_t i_uid;
+ gid_t i_gid;
+ projid_t i_projid;
+ int block;
+ int err;
+
+ /* err is not checked here; filling continues and err is returned at the end */
+ err = ext4_inode_blocks_set(raw_inode, ei);
+
+ raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
+ i_projid = from_kprojid(&init_user_ns, ei->i_projid);
+ /* Without NO_UID32 the ids are split into low and high 16-bit halves */
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
+ /*
+ * Fix up interoperability with old kernels. Otherwise,
+ * old inodes get re-used with the upper 16 bits of the
+ * uid/gid intact.
+ */
+ if (ei->i_dtime && list_empty(&ei->i_orphan)) {
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ } else {
+ raw_inode->i_uid_high =
+ cpu_to_le16(high_16_bits(i_uid));
+ raw_inode->i_gid_high =
+ cpu_to_le16(high_16_bits(i_gid));
+ }
+ } else {
+ /* With NO_UID32 only the low fields carry an id; high halves are zeroed */
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ }
+ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+
+ EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
+
+ raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+ /* The high 16 bits of i_file_acl are not written in HURD_COMPAT mode */
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
+ raw_inode->i_file_acl_high =
+ cpu_to_le16(ei->i_file_acl >> 32);
+ raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+ ext4_isize_set(raw_inode, ei->i_disksize);
+
+ raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+ /* Character and block device inodes keep the device number in i_block[] */
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ raw_inode->i_block[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ raw_inode->i_block[1] = 0;
+ } else {
+ raw_inode->i_block[0] = 0;
+ raw_inode->i_block[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ raw_inode->i_block[2] = 0;
+ }
+ } else if (!ext4_has_inline_data(inode)) {
+ /* Inodes with inline data leave i_block[] untouched here */
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ raw_inode->i_block[block] = ei->i_data[block];
+ }
+
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ u64 ivers = ext4_inode_peek_iversion(inode);
+
+ /* Low 32 bits always; high 32 bits only if the extra area has room */
+ raw_inode->i_disk_version = cpu_to_le32(ivers);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(ivers >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
+ }
+
+ /* Keep an earlier error if there is one, otherwise report corruption */
+ if (i_projid != EXT4_DEF_PROJID &&
+ !ext4_has_feature_project(inode->i_sb))
+ err = err ?: -EFSCORRUPTED;
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+ raw_inode->i_projid = cpu_to_le32(i_projid);
+
+ ext4_inode_csum_set(inode, raw_inode, ei);
+ return err;
+}
+
/*
* ext4_get_inode_loc returns with an extra refcount against the inode's
- * underlying buffer_head on success. If 'in_mem' is true, we have all
- * data in memory that is needed to recreate the on-disk version of this
- * inode.
+ * underlying buffer_head on success. If we pass 'inode' and it does not
+ * have in-inode xattr, we have all inode data in memory that is needed
+ * to recreate the on-disk version of this inode.
*/
static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
- struct ext4_iloc *iloc, int in_mem,
+ struct inode *inode, struct ext4_iloc *iloc,
ext4_fsblk_t *ret_block)
{
struct ext4_group_desc *gdp;
@@ -4287,7 +4468,7 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
* is the only valid inode in the block, we need not read the
* block.
*/
- if (in_mem) {
+ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
struct buffer_head *bitmap_bh;
int i, start;
@@ -4315,8 +4496,13 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
}
brelse(bitmap_bh);
if (i == start + inodes_per_block) {
+ struct ext4_inode *raw_inode =
+ (struct ext4_inode *) (bh->b_data + iloc->offset);
+
/* all other inodes are free, so skip I/O */
memset(bh->b_data, 0, bh->b_size);
+ if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
+ ext4_fill_raw_inode(inode, raw_inode);
set_buffer_uptodate(bh);
unlock_buffer(bh);
goto has_buffer;
@@ -4374,10 +4560,10 @@ has_buffer:
static int __ext4_get_inode_loc_noinmem(struct inode *inode,
struct ext4_iloc *iloc)
{
- ext4_fsblk_t err_blk;
+ ext4_fsblk_t err_blk = 0;
int ret;
- ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
&err_blk);
if (ret == -EIO)
@@ -4389,12 +4575,11 @@ static int __ext4_get_inode_loc_noinmem(struct inode *inode,
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
- ext4_fsblk_t err_blk;
+ ext4_fsblk_t err_blk = 0;
int ret;
- /* We have all inode data except xattrs in memory here. */
- ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
- !ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
+ &err_blk);
if (ret == -EIO)
ext4_error_inode_block(inode, err_blk, EIO,
@@ -4407,7 +4592,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
struct ext4_iloc *iloc)
{
- return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
+ return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
}
static bool ext4_should_enable_dax(struct inode *inode)
@@ -4528,13 +4713,6 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
else
inode_set_iversion_queried(inode, val);
}
-static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
-{
- if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return inode_peek_iversion_raw(inode);
- else
- return inode_peek_iversion(inode);
-}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
@@ -4855,51 +5033,6 @@ bad_inode:
return ERR_PTR(ret);
}
-static int ext4_inode_blocks_set(handle_t *handle,
- struct ext4_inode *raw_inode,
- struct ext4_inode_info *ei)
-{
- struct inode *inode = &(ei->vfs_inode);
- u64 i_blocks = READ_ONCE(inode->i_blocks);
- struct super_block *sb = inode->i_sb;
-
- if (i_blocks <= ~0U) {
- /*
- * i_blocks can be represented in a 32 bit variable
- * as multiple of 512 bytes
- */
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = 0;
- ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- return 0;
- }
-
- /*
- * This should never happen since sb->s_maxbytes should not have
- * allowed this, sb->s_maxbytes was set according to the huge_file
- * feature in ext4_fill_super().
- */
- if (!ext4_has_feature_huge_file(sb))
- return -EFSCORRUPTED;
-
- if (i_blocks <= 0xffffffffffffULL) {
- /*
- * i_blocks can be represented in a 48 bit variable
- * as multiple of 512 bytes
- */
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- } else {
- ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- /* i_block is stored in file system block size */
- i_blocks = i_blocks >> (inode->i_blkbits - 9);
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- }
- return 0;
-}
-
static void __ext4_update_other_inode_time(struct super_block *sb,
unsigned long orig_ino,
unsigned long ino,
@@ -4975,11 +5108,8 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
struct super_block *sb = inode->i_sb;
- int err = 0, block;
+ int err;
int need_datasync = 0, set_large_file = 0;
- uid_t i_uid;
- gid_t i_gid;
- projid_t i_projid;
spin_lock(&ei->i_raw_lock);
@@ -4990,97 +5120,15 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- err = ext4_inode_blocks_set(handle, raw_inode, ei);
-
- raw_inode->i_mode = cpu_to_le16(inode->i_mode);
- i_uid = i_uid_read(inode);
- i_gid = i_gid_read(inode);
- i_projid = from_kprojid(&init_user_ns, ei->i_projid);
- if (!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
- /*
- * Fix up interoperability with old kernels. Otherwise,
- * old inodes get re-used with the upper 16 bits of the
- * uid/gid intact.
- */
- if (ei->i_dtime && list_empty(&ei->i_orphan)) {
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- } else {
- raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(i_uid));
- raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(i_gid));
- }
- } else {
- raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
- raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- }
- raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
- EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
-
- raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
- if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
- raw_inode->i_file_acl_high =
- cpu_to_le16(ei->i_file_acl >> 32);
- raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) {
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
need_datasync = 1;
- }
if (ei->i_disksize > 0x7fffffffULL) {
if (!ext4_has_feature_large_file(sb) ||
- EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV))
+ EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
set_large_file = 1;
}
- raw_inode->i_generation = cpu_to_le32(inode->i_generation);
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (old_valid_dev(inode->i_rdev)) {
- raw_inode->i_block[0] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- raw_inode->i_block[1] = 0;
- } else {
- raw_inode->i_block[0] = 0;
- raw_inode->i_block[1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- raw_inode->i_block[2] = 0;
- }
- } else if (!ext4_has_inline_data(inode)) {
- for (block = 0; block < EXT4_N_BLOCKS; block++)
- raw_inode->i_block[block] = ei->i_data[block];
- }
-
- if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
- u64 ivers = ext4_inode_peek_iversion(inode);
-
- raw_inode->i_disk_version = cpu_to_le32(ivers);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(ivers >> 32);
- raw_inode->i_extra_isize =
- cpu_to_le16(ei->i_extra_isize);
- }
- }
- if (i_projid != EXT4_DEF_PROJID &&
- !ext4_has_feature_project(inode->i_sb))
- err = err ?: -EFSCORRUPTED;
-
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
- EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
- raw_inode->i_projid = cpu_to_le32(i_projid);
-
- ext4_inode_csum_set(inode, raw_inode, ei);
+ err = ext4_fill_raw_inode(inode, raw_inode);
spin_unlock(&ei->i_raw_lock);
if (err) {
EXT4_ERROR_INODE(inode, "corrupted inode contents");
@@ -5204,13 +5252,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
- * buffers that are attached to a page stradding i_size and are undergoing
+ * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
+ * buffers that are attached to a folio straddling i_size and are undergoing
* commit. In that case we have to wait for commit to finish and try again.
*/
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
- struct page *page;
unsigned offset;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = 0;
@@ -5218,25 +5265,25 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * If the page is fully truncated, we don't need to wait for any commit
- * (and we even should not as __ext4_journalled_invalidatepage() may
- * strip all buffers from the page but keep the page dirty which can then
- * confuse e.g. concurrent ext4_writepage() seeing dirty page without
+ * If the folio is fully truncated, we don't need to wait for any commit
+ * (and we even should not as __ext4_journalled_invalidate_folio() may
+ * strip all buffers from the folio but keep the folio dirty which can then
+ * confuse e.g. concurrent ext4_writepage() seeing dirty folio without
* buffers). Also we don't need to wait for any commit if all buffers in
- * the page remain valid. This is most beneficial for the common case of
+ * the folio remain valid. This is most beneficial for the common case of
* blocksize == PAGESIZE.
*/
if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
- page = find_lock_page(inode->i_mapping,
+ struct folio *folio = filemap_lock_folio(inode->i_mapping,
inode->i_size >> PAGE_SHIFT);
- if (!page)
+ if (!folio)
return;
- ret = __ext4_journalled_invalidatepage(page, offset,
- PAGE_SIZE - offset);
- unlock_page(page);
- put_page(page);
+ ret = __ext4_journalled_invalidate_folio(folio, offset,
+ folio_size(folio) - offset);
+ folio_unlock(folio);
+ folio_put(folio);
if (ret != -EBUSY)
return;
commit_tid = 0;
@@ -5271,7 +5318,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
* transaction are already on disk (truncate waits for pages under
* writeback).
*
- * Called with inode->i_mutex down.
+ * Called with inode->i_rwsem down.
*/
int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr)
@@ -5309,7 +5356,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (error)
return error;
}
- ext4_fc_start_update(inode);
+
if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
(ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
@@ -5333,7 +5380,6 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (error) {
ext4_journal_stop(handle);
- ext4_fc_stop_update(inode);
return error;
}
/* Update corresponding info in inode so that everything is in
@@ -5345,7 +5391,6 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
error = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
if (unlikely(error)) {
- ext4_fc_stop_update(inode);
return error;
}
}
@@ -5359,12 +5404,10 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (attr->ia_size > sbi->s_bitmap_maxbytes) {
- ext4_fc_stop_update(inode);
return -EFBIG;
}
}
if (!S_ISREG(inode->i_mode)) {
- ext4_fc_stop_update(inode);
return -EINVAL;
}
@@ -5416,8 +5459,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
ext4_fc_track_range(handle, inode,
(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
inode->i_sb->s_blocksize_bits,
- (oldsize > 0 ? oldsize - 1 : 0) >>
- inode->i_sb->s_blocksize_bits);
+ EXT_MAX_BLOCKS - 1);
else
ext4_fc_track_range(
handle, inode,
@@ -5488,7 +5530,6 @@ err_out:
ext4_std_error(inode->i_sb, error);
if (!error)
error = rc;
- ext4_fc_stop_update(inode);
return error;
}
@@ -5989,7 +6030,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return PTR_ERR(handle);
ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
err = ext4_mark_inode_dirty(handle, inode);
ext4_handle_sync(handle);
ext4_journal_stop(handle);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 606dee9e08a3..ba44fa1be70a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -27,7 +27,249 @@
#include "fsmap.h"
#include <trace/events/ext4.h>
-/**
+/*
+ * Callback type used by the superblock update helpers in this file: it is
+ * handed one superblock copy 'es' to modify in place, together with the
+ * caller-supplied, callback-specific argument 'arg'.
+ */
+typedef void ext4_update_sb_callback(struct ext4_super_block *es,
+ const void *arg);
+
+/*
+ * Superblock modification callback function for changing file system
+ * label.
+ *
+ * @es:  superblock copy whose s_volume_name is overwritten
+ * @arg: new label; exactly EXT4_LABEL_MAX bytes are read from it, so the
+ *       caller must supply a buffer at least that large
+ */
+static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg)
+{
+	/* Sanity check, this should never happen */
+	BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX);
+
+	/* memcpy() takes a const void * source, so 'arg' needs no cast */
+	memcpy(es->s_volume_name, arg, EXT4_LABEL_MAX);
+}
+
+/*
+ * Apply the callback 'func' (with argument 'arg') to the primary
+ * superblock, sbi->s_es, as part of the transaction 'handle'.
+ *
+ * The superblock buffer is modified and its checksum recomputed under the
+ * buffer lock, marked dirty through the journal handle and then written
+ * out synchronously with sync_dirty_buffer().
+ *
+ * Returns 0 on success or a negative error; the result is also passed to
+ * ext4_std_error() on every path.
+ */
+static
+int ext4_update_primary_sb(struct super_block *sb, handle_t *handle,
+ ext4_update_sb_callback func,
+ const void *arg)
+{
+ int err = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh = sbi->s_sbh;
+ struct ext4_super_block *es = sbi->s_es;
+
+ trace_ext4_update_sb(sb, bh->b_blocknr, 1);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb,
+ bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+
+ /* Modify the superblock and refresh its checksum under the buffer lock */
+ lock_buffer(bh);
+ func(es, arg);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(bh);
+
+ /*
+ * An earlier failed write leaves the buffer flagged with a write I/O
+ * error or not uptodate; report it, then reset the flags so the
+ * buffer can be dirtied and written again below.
+ */
+ if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) {
+ ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_err;
+ err = sync_dirty_buffer(bh);
+out_err:
+ ext4_std_error(sb, err);
+ return err;
+}
+
+/*
+ * Update one backup superblock in the group 'grp' using the callback
+ * function 'func' and argument 'arg'. If the handle is NULL the
+ * modification is not journalled.
+ *
+ * When metadata checksums are enabled, the backup's existing checksum is
+ * verified before 'func' runs and recomputed afterwards. The buffer is
+ * then marked dirty (through the journal when 'handle' is set) and written
+ * out synchronously.
+ *
+ * Returns: 0 when no modification was done (no superblock in the group)
+ *          1 when the modification was successful
+ *         <0 on error (-EFSBADCRC when the existing checksum is invalid)
+ */
+static int ext4_update_backup_sb(struct super_block *sb,
+				 handle_t *handle, ext4_group_t grp,
+				 ext4_update_sb_callback func, const void *arg)
+{
+	int err = 0;
+	ext4_fsblk_t sb_block;
+	struct buffer_head *bh;
+	unsigned long offset = 0;
+	struct ext4_super_block *es;
+
+	if (!ext4_bg_has_super(sb, grp))
+		return 0;
+
+	/*
+	 * For group 0 there is always 1k of padding in front of the
+	 * superblock, so depending on the block size we have to adjust
+	 * either the offset within the block or sb_block itself.
+	 */
+	if (grp == 0) {
+		sb_block = 1 * EXT4_MIN_BLOCK_SIZE;
+		/* do_div() leaves the quotient in sb_block and returns the remainder */
+		offset = do_div(sb_block, sb->s_blocksize);
+	} else {
+		sb_block = ext4_group_first_block_no(sb, grp);
+		offset = 0;
+	}
+
+	trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0);
+
+	bh = ext4_sb_bread(sb, sb_block, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
+
+	if (handle) {
+		BUFFER_TRACE(bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, sb,
+						    bh,
+						    EXT4_JTR_NONE);
+		if (err)
+			goto out_bh;
+	}
+
+	es = (struct ext4_super_block *) (bh->b_data + offset);
+	lock_buffer(bh);
+	if (ext4_has_metadata_csum(sb) &&
+	    es->s_checksum != ext4_superblock_csum(sb, es)) {
+		ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
+		       "superblock %llu\n", sb_block);
+		unlock_buffer(bh);
+		err = -EFSBADCRC;
+		goto out_bh;
+	}
+	func(es, arg);
+	if (ext4_has_metadata_csum(sb))
+		es->s_checksum = ext4_superblock_csum(sb, es);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	/* err is necessarily 0 here: every earlier failure jumped to out_bh */
+	if (handle) {
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			goto out_bh;
+	} else {
+		BUFFER_TRACE(bh, "marking dirty");
+		mark_buffer_dirty(bh);
+	}
+	err = sync_dirty_buffer(bh);
+
+out_bh:
+	brelse(bh);
+	/*
+	 * NOTE(review): this also reports -EFSBADCRC, which the caller in this
+	 * file treats as skippable - confirm that reporting it is intended.
+	 */
+	ext4_std_error(sb, err);
+	return (err) ? err : 1;
+}
+
+/*
+ * Update primary and backup superblocks using the provided function
+ * func and argument arg.
+ *
+ * Only the primary superblock and at most two backup superblock
+ * modifications are journalled; the rest is modified without journal.
+ * This is safe because e2fsck will re-write them if there is a problem,
+ * and we're very unlikely to ever need more than two backups.
+ *
+ * The EXT4_FLAGS_RESIZING bit is held for the duration of the update so
+ * that it cannot run concurrently with an online resize.
+ *
+ * Returns 0 on success, -EBUSY if the RESIZING bit is already set, or
+ * another negative error from the journal or superblock update helpers.
+ * A backup superblock with a bad checksum (-EFSBADCRC) is skipped and
+ * does not fail the whole operation.
+ */
+static
+int ext4_update_superblocks_fn(struct super_block *sb,
+			       ext4_update_sb_callback func,
+			       const void *arg)
+{
+	handle_t *handle;
+	ext4_group_t ngroups;
+	/*
+	 * Iteration state for ext4_list_backups(). 'three' starts at 1, not 3.
+	 * NOTE(review): presumably so that group 1 is returned first - confirm
+	 * against ext4_list_backups().
+	 */
+	unsigned int three = 1;
+	unsigned int five = 5;
+	unsigned int seven = 7;
+	int err = 0, ret, i;
+	ext4_group_t grp, primary_grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/*
+	 * We can't update superblocks while the online resize is running
+	 */
+	if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
+				  &sbi->s_ext4_flags)) {
+		ext4_msg(sb, KERN_ERR, "Can't modify superblock while "
+			 "performing online resize");
+		return -EBUSY;
+	}
+
+	/*
+	 * We're only going to update primary superblock and two
+	 * backup superblocks in this transaction.
+	 */
+	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out;
+	}
+
+	/* Update primary superblock */
+	err = ext4_update_primary_sb(sb, handle, func, arg);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "Failed to update primary "
+			 "superblock");
+		goto out_journal;
+	}
+
+	primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr);
+	ngroups = ext4_get_groups_count(sb);
+
+	/*
+	 * Update backup superblocks. We have to start from group 0
+	 * because it might not be where the primary superblock is
+	 * if the fs is mounted with -o sb=<backup_sb_block>
+	 */
+	i = 0;	/* number of backup superblocks updated so far */
+	grp = 0;
+	while (grp < ngroups) {
+		/* Skip primary superblock */
+		if (grp == primary_grp)
+			goto next_grp;
+
+		ret = ext4_update_backup_sb(sb, handle, grp, func, arg);
+		if (ret < 0) {
+			/* Ignore bad checksum; try to update next sb */
+			if (ret == -EFSBADCRC)
+				goto next_grp;
+			err = ret;
+			goto out_journal;
+		}
+
+		/* ret is 1 if a backup was updated, 0 if the group has none */
+		i += ret;
+		if (handle && i > 1) {
+			/*
+			 * We're only journalling primary superblock and
+			 * two backup superblocks; the rest is not
+			 * journalled.
+			 */
+			err = ext4_journal_stop(handle);
+			if (err)
+				goto out;
+			handle = NULL;
+		}
+next_grp:
+		grp = ext4_list_backups(sb, &three, &five, &seven);
+	}
+
+out_journal:
+	if (handle) {
+		/* Do not let a stop failure mask an earlier error */
+		ret = ext4_journal_stop(handle);
+		if (ret && !err)
+			err = ret;
+	}
+out:
+	clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags);
+	smp_mb__after_atomic();
+	return err;
+}
+
+/*
* Swap memory between @a and @b for @len bytes.
*
* @a: pointer to first memory area
@@ -48,7 +290,7 @@ static void memswap(void *a, void *b, size_t len)
}
}
-/**
+/*
* Swap i_data and associated attributes between @inode1 and @inode2.
* This function is used for the primary swap between inode1 and inode2
* and also to revert this primary swap in case of errors.
@@ -102,7 +344,7 @@ void ext4_reset_inode_seed(struct inode *inode)
ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
}
-/**
+/*
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
* important fields of the inodes.
@@ -169,7 +411,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
err = -EINVAL;
goto err_out;
}
- ext4_fc_start_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle);
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
@@ -252,7 +494,6 @@ revert:
err_out1:
ext4_journal_stop(handle);
- ext4_fc_stop_ineligible(sb);
ext4_double_up_write_data_sem(inode, inode_bl);
err_out:
@@ -743,7 +984,6 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns,
u32 flags = fa->flags;
int err = -EOPNOTSUPP;
- ext4_fc_start_update(inode);
if (flags & ~EXT4_FL_USER_VISIBLE)
goto out;
@@ -764,7 +1004,6 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns,
goto out;
err = ext4_ioctl_setproject(inode, fa->fsx_projid);
out:
- ext4_fc_stop_update(inode);
return err;
}
@@ -850,6 +1089,64 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
return err;
}
+/*
+ * Set the file system label from the user buffer 'user_label'.
+ *
+ * The caller must have CAP_SYS_ADMIN. The label is copied in, checked to
+ * be at most EXT4_LABEL_MAX bytes long, zero-padded and then written to
+ * the superblocks through ext4_update_superblocks_fn() while write access
+ * to the mount is held.
+ *
+ * Returns 0 on success, -EPERM without the capability, -EFAULT if the
+ * user buffer cannot be read, -EINVAL if the label is too long, or the
+ * error from mnt_want_write_file() / ext4_update_superblocks_fn().
+ */
+static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label)
+{
+ size_t len;
+ int ret = 0;
+ char new_label[EXT4_LABEL_MAX + 1];
+ struct super_block *sb = file_inode(filp)->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Copy the maximum length allowed for ext4 label with one more to
+ * find the required terminating null byte in order to test the
+ * label length. The on disk label doesn't need to be null terminated.
+ *
+ * NOTE(review): the full EXT4_LABEL_MAX + 1 bytes are always read, so
+ * the user buffer must be that large even for a shorter label -
+ * confirm this matches the FS_IOC_SETFSLABEL contract.
+ */
+ if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1))
+ return -EFAULT;
+
+ /* No NUL within the first EXT4_LABEL_MAX + 1 bytes: label too long */
+ len = strnlen(new_label, EXT4_LABEL_MAX + 1);
+ if (len > EXT4_LABEL_MAX)
+ return -EINVAL;
+
+ /*
+ * Clear the buffer after the new label
+ */
+ memset(new_label + len, 0, EXT4_LABEL_MAX - len);
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label);
+
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+/*
+ * Copy the file system label to the user buffer 'user_label'.
+ *
+ * The label is read from sbi->s_es->s_volume_name under the superblock
+ * buffer lock into a zeroed local buffer one byte larger than
+ * EXT4_LABEL_MAX, so the copy handed to user space is always
+ * null-terminated. EXT4_LABEL_MAX + 1 bytes are written to 'user_label'.
+ *
+ * Returns 0 on success or -EFAULT if the user buffer cannot be written.
+ */
+static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label)
+{
+ char label[EXT4_LABEL_MAX + 1];
+
+ /*
+ * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because
+ * FSLABEL_MAX must include terminating null byte, while s_volume_name
+ * does not have to.
+ */
+ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
+
+ /* Zero first: strncpy() copies at most EXT4_LABEL_MAX bytes, leaving the last byte 0 */
+ memset(label, 0, sizeof(label));
+ lock_buffer(sbi->s_sbh);
+ strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX);
+ unlock_buffer(sbi->s_sbh);
+
+ if (copy_to_user(user_label, label, sizeof(label)))
+ return -EFAULT;
+ return 0;
+}
+
static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1076,7 +1373,7 @@ mext_out:
err = ext4_resize_fs(sb, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
- ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
@@ -1117,8 +1414,6 @@ resizefs_out:
sizeof(range)))
return -EFAULT;
- range.minlen = max((unsigned int)range.minlen,
- q->limits.discard_granularity);
ret = ext4_trim_fs(sb, &range);
if (ret < 0)
return ret;
@@ -1266,6 +1561,13 @@ resizefs_out:
case EXT4_IOC_CHECKPOINT:
return ext4_ioctl_checkpoint(filp, arg);
+ case FS_IOC_GETFSLABEL:
+ return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg);
+
+ case FS_IOC_SETFSLABEL:
+ return ext4_ioctl_setlabel(filp,
+ (const void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -1273,13 +1575,7 @@ resizefs_out:
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- long ret;
-
- ext4_fc_start_update(file_inode(filp));
- ret = __ext4_ioctl(filp, cmd, arg);
- ext4_fc_stop_update(file_inode(filp));
-
- return ret;
+ return __ext4_ioctl(filp, cmd, arg);
}
#ifdef CONFIG_COMPAT
@@ -1347,6 +1643,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
case EXT4_IOC_CHECKPOINT:
+ case FS_IOC_GETFSLABEL:
+ case FS_IOC_SETFSLABEL:
break;
default:
return -ENOIOCTLCMD;
@@ -1354,3 +1652,19 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
+
+/*
+ * Superblock-update callback used by ext4_update_overhead().
+ * @es: on-disk superblock to modify
+ * @arg: pointer that is dereferenced as an unsigned long holding the
+ * overhead value; it is stored in s_overhead_clusters through
+ * cpu_to_le32().
+ */
+static void set_overhead(struct ext4_super_block *es, const void *arg)
+{
+ es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
+}
+
+/*
+ * Store the in-memory overhead value (sbi->s_overhead) in the on-disk
+ * superblock by running set_overhead() through ext4_update_superblocks_fn().
+ *
+ * Returns 0 without writing anything when the filesystem is read-only,
+ * when s_overhead is 0, or when the on-disk s_overhead_clusters already
+ * holds the same value; otherwise returns the result of
+ * ext4_update_superblocks_fn().
+ */
+int ext4_update_overhead(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sb_rdonly(sb) || sbi->s_overhead == 0 ||
+ sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))
+ return 0;
+
+ return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead);
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 72bfac2d6dce..252c168454c7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1000,7 +1000,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
return 0;
if (ac->ac_criteria >= 2)
return 0;
- if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
+ if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
return 0;
return 1;
}
@@ -1689,7 +1689,7 @@ static int mb_test_and_clear_bits(void *bm, int cur, int len)
return zero_bit;
}
-void ext4_set_bits(void *bm, int cur, int len)
+void mb_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1996,7 +1996,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
- ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
+ mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -2834,7 +2834,7 @@ out:
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group;
if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
@@ -2845,7 +2845,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group;
++*pos;
@@ -2857,7 +2857,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
int i;
int err, buddy_loaded = 0;
@@ -2985,7 +2985,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
__acquires(&EXT4_SB(sb)->s_mb_rb_lock)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
unsigned long position;
read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
@@ -2998,7 +2998,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock)
static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
unsigned long position;
++*pos;
@@ -3010,7 +3010,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof
static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned long position = ((unsigned long) v);
struct ext4_group_info *grp;
@@ -3058,7 +3058,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
__releases(&EXT4_SB(sb)->s_mb_rb_lock)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
}
@@ -3825,7 +3825,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* We leak some of the blocks here.
*/
ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3844,7 +3844,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
if (ext4_has_group_desc_csum(sb) &&
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
@@ -3899,69 +3899,103 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i, clen, err;
+ int i, err;
int already;
+ unsigned int clen, clen_changed, thisgrp_len;
- clen = EXT4_B2C(sbi, len);
+ while (len > 0) {
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto out_err;
- }
+ /*
+ * Check to see if we are marking blocks across a group
+ * boundary.
+ * In case of flex_bg, it can happen that (block, len) may
+ * span across more than one group. In that case we need to
+ * get the corresponding group metadata to work with.
+ * Each pass of the enclosing while loop handles one group.
+ */
+ thisgrp_len = min_t(unsigned int, (unsigned int)len,
+ EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
+ clen = EXT4_NUM_B2C(sbi, thisgrp_len);
+
+ if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
+ ext4_error(sb, "Marking blocks in system zone - "
+ "Block = %llu, len = %u",
+ block, thisgrp_len);
+ bitmap_bh = NULL;
+ break;
+ }
- err = -EIO;
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
- goto out_err;
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ break;
+ }
- ext4_lock_group(sb, group);
- already = 0;
- for (i = 0; i < clen; i++)
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state)
- already++;
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ break;
- if (state)
- ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
- else
- mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb,
- group, gdp));
- }
- if (state)
- clen = ext4_free_group_clusters(sb, gdp) - clen + already;
- else
- clen = ext4_free_group_clusters(sb, gdp) + clen - already;
+ ext4_lock_group(sb, group);
+ already = 0;
+ for (i = 0; i < clen; i++)
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ !state)
+ already++;
+
+ clen_changed = clen - already;
+ if (state)
+ mb_set_bits(bitmap_bh->b_data, blkoff, clen);
+ else
+ mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+ if (state)
+ clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
+ else
+ clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
- ext4_free_group_clusters_set(sb, gdp, clen);
- ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_free_group_clusters_set(sb, gdp, clen);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
- ext4_unlock_group(sb, group);
+ ext4_unlock_group(sb, group);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
- atomic64_sub(len,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
+ if (state)
+ atomic64_sub(clen_changed, &fg->free_clusters);
+ else
+ atomic64_add(clen_changed, &fg->free_clusters);
+
+ }
+
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ break;
+ sync_dirty_buffer(bitmap_bh);
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(gdp_bh);
+ if (err)
+ break;
+
+ block += thisgrp_len;
+ len -= thisgrp_len;
+ brelse(bitmap_bh);
+ BUG_ON(len < 0);
}
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
if (err)
- goto out_err;
- sync_dirty_buffer(bitmap_bh);
- err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(gdp_bh);
-
-out_err:
- brelse(bitmap_bh);
+ brelse(bitmap_bh);
}
/*
@@ -4433,7 +4467,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
while (n) {
entry = rb_entry(n, struct ext4_free_data, efd_node);
- ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
+ mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
return;
@@ -4474,7 +4508,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
if (unlikely(len == 0))
continue;
BUG_ON(groupnr != group);
- ext4_set_bits(bitmap, start, len);
+ mb_set_bits(bitmap, start, len);
preallocated += len;
}
mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
@@ -4814,7 +4848,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
*/
static noinline_for_stack int
ext4_mb_discard_group_preallocations(struct super_block *sb,
- ext4_group_t group, int needed)
+ ext4_group_t group, int *busy)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct buffer_head *bitmap_bh = NULL;
@@ -4822,8 +4856,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
struct list_head list;
struct ext4_buddy e4b;
int err;
- int busy = 0;
- int free, free_total = 0;
+ int free = 0;
mb_debug(sb, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
@@ -4846,19 +4879,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
goto out_dbg;
}
- if (needed == 0)
- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
-
INIT_LIST_HEAD(&list);
-repeat:
- free = 0;
ext4_lock_group(sb, group);
list_for_each_entry_safe(pa, tmp,
&grp->bb_prealloc_list, pa_group_list) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
spin_unlock(&pa->pa_lock);
- busy = 1;
+ *busy = 1;
continue;
}
if (pa->pa_deleted) {
@@ -4898,22 +4926,13 @@ repeat:
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
- free_total += free;
-
- /* if we still need more blocks and some PAs were used, try again */
- if (free_total < needed && busy) {
- ext4_unlock_group(sb, group);
- cond_resched();
- busy = 0;
- goto repeat;
- }
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
out_dbg:
mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
- free_total, group, grp->bb_free);
- return free_total;
+ free, group, grp->bb_free);
+ return free;
}
/*
@@ -5455,13 +5474,24 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int ret;
- int freed = 0;
+ int freed = 0, busy = 0;
+ int retry = 0;
trace_ext4_mb_discard_preallocations(sb, needed);
+
+ if (needed == 0)
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+ repeat:
for (i = 0; i < ngroups && needed > 0; i++) {
- ret = ext4_mb_discard_group_preallocations(sb, i, needed);
+ ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
freed += ret;
needed -= ret;
+ cond_resched();
+ }
+
+ if (needed > 0 && busy && ++retry < 3) {
+ busy = 0;
+ goto repeat;
}
return freed;
@@ -5757,7 +5787,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
struct super_block *sb = ar->inode->i_sb;
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i = sb->s_blocksize;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
ext4_fsblk_t goal, block;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -5779,19 +5810,26 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
ext4_get_group_no_and_offset(sb,
max(ext4_group_first_block_no(sb, group), goal),
NULL, &blkoff);
- i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize,
+ while (1) {
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
blkoff);
+ if (i >= max)
+ break;
+ if (ext4_fc_replay_check_excluded(sb,
+ ext4_group_first_block_no(sb, group) + i)) {
+ blkoff = i + 1;
+ } else
+ break;
+ }
brelse(bitmap_bh);
- if (i >= sb->s_blocksize)
- continue;
- if (ext4_fc_replay_check_excluded(sb,
- ext4_group_first_block_no(sb, group) + i))
- continue;
- break;
+ if (i < max)
+ break;
}
- if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize)
+ if (group >= ext4_get_groups_count(sb) || i >= max) {
+ *errp = -ENOSPC;
return 0;
+ }
block = ext4_group_first_block_no(sb, group) + i;
ext4_mb_mark_bb(sb, block, 1, 1);
@@ -5842,17 +5880,17 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
}
/**
- * ext4_free_blocks() -- Free given blocks and update quota
+ * ext4_mb_clear_bb() -- helper function for freeing blocks.
+ * Used by ext4_free_blocks()
* @handle: handle for this transaction
* @inode: inode
- * @bh: optional buffer of the block to be freed
* @block: starting physical block to be freed
* @count: number of blocks to be freed
* @flags: flags used by ext4_free_blocks
*/
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block,
- unsigned long count, int flags)
+static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t block, unsigned long count,
+ int flags)
{
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
@@ -5869,80 +5907,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
- if (sbi->s_mount_state & EXT4_FC_REPLAY) {
- ext4_free_blocks_simple(inode, block, count);
- return;
- }
-
- might_sleep();
- if (bh) {
- if (block)
- BUG_ON(block != bh->b_blocknr);
- else
- block = bh->b_blocknr;
- }
-
- if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
- !ext4_inode_block_valid(inode, block, count)) {
- ext4_error(sb, "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
- goto error_return;
- }
-
- ext4_debug("freeing block %llu\n", block);
- trace_ext4_free_blocks(inode, block, count, flags);
-
- if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- BUG_ON(count > 1);
-
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block);
- }
-
- /*
- * If the extent to be freed does not begin on a cluster
- * boundary, we need to deal with partial clusters at the
- * beginning and end of the extent. Normally we will free
- * blocks at the beginning or the end unless we are explicitly
- * requested to avoid doing so.
- */
- overflow = EXT4_PBLK_COFF(sbi, block);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
- overflow = sbi->s_cluster_ratio - overflow;
- block += overflow;
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else {
- block -= overflow;
- count += overflow;
- }
- }
- overflow = EXT4_LBLK_COFF(sbi, count);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else
- count += sbi->s_cluster_ratio - overflow;
- }
-
- if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- int i;
- int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
-
- for (i = 0; i < count; i++) {
- cond_resched();
- if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
- ext4_forget(handle, is_metadata, inode, bh, block + i);
- }
- }
-
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -5973,13 +5937,7 @@ do_more:
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
- in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
- in_range(block, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group)) {
-
+ if (!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
@@ -6050,7 +6008,7 @@ do_more:
NULL);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%lu failed"
+ " group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
err);
} else
@@ -6111,6 +6069,103 @@ error_return:
}
/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @bh: optional buffer of the block to be freed
+ * @block: starting physical block to be freed
+ * @count: number of blocks to be freed
+ * @flags: flags used by ext4_free_blocks
+ *
+ * During fast-commit replay (EXT4_FC_REPLAY set in s_mount_state) the
+ * request is handed to ext4_free_blocks_simple() and nothing else is done.
+ * Otherwise the range is validated (unless EXT4_FREE_BLOCKS_VALIDATED is
+ * set), widened or shrunk to cluster boundaries according to the
+ * EXT4_FREE_BLOCKS_NOFREE_{FIRST,LAST}_CLUSTER flags, the affected buffers
+ * are passed to ext4_forget() when EXT4_FREE_BLOCKS_FORGET is set, and the
+ * resulting (block, count) range is given to ext4_mb_clear_bb().
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned int overflow;
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, count);
+ return;
+ }
+
+ might_sleep();
+ /* With a buffer supplied, @block is either 0 or the buffer's block */
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
+
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
+ return;
+ }
+
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
+
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
+ }
+
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ }
+ /* Same treatment for a partial cluster at the end of the range */
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ }
+
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
+
+ /* bh is NULL on entry; it is only looked up for metadata blocks */
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ if (is_metadata)
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
+ }
+ }
+
+ ext4_mb_clear_bb(handle, inode, block, count, flags);
+ return;
+}
+
+/**
* ext4_group_add_blocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
@@ -6166,11 +6221,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
+ if (!ext4_sb_block_valid(sb, NULL, block, count)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
@@ -6299,7 +6350,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
ext4_grpblk_t next, count, free_count;
void *bitmap;
- int ret = 0;
bitmap = e4b->bd_bitmap;
start = (e4b->bd_info->bb_first_free > start) ?
@@ -6314,10 +6364,10 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start, next - start, e4b);
+ int ret = ext4_trim_extent(sb, start, next - start, e4b);
+
if (ret && ret != -EOPNOTSUPP)
break;
- ret = 0;
count += next - start;
}
free_count += next - start;
@@ -6374,7 +6424,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_lock_group(sb, group);
if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
- minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) {
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
if (ret >= 0)
EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
@@ -6405,6 +6455,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
@@ -6423,6 +6474,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start >= max_blks ||
range->len < sb->s_blocksize)
return -EINVAL;
+ /* No point to try to trim less than discard granularity */
+ if (range->minlen < q->limits.discard_granularity) {
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ q->limits.discard_granularity >> sb->s_blocksize_bits);
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
+ goto out;
+ }
if (end >= max_blks)
end = max_blks - 1;
if (end <= first_data_blk)
@@ -6475,7 +6533,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
if (!ret)
- atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+ EXT4_SB(sb)->s_last_trim_minblks = minlen;
out:
range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 7e0b4f81c6c0..7a5353a8cfd7 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -437,12 +437,12 @@ int ext4_ext_migrate(struct inode *inode)
percpu_down_write(&sbi->s_writepages_rwsem);
/*
- * Worst case we can touch the allocation bitmaps, a bgd
- * block, and a block to link in the orphan list. We do need
- * need to worry about credits for modifying the quota inode.
+ * Worst case we can touch the allocation bitmaps and a block
+ * group descriptor block. We do need to worry about
+ * credits for modifying the quota inode.
*/
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
- 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+ 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
@@ -459,6 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
ext4_journal_stop(handle);
goto out_unlock;
}
+ /*
+ * Use the correct seed for checksum (i.e. the seed from 'inode'). This
+ * is so that the metadata blocks will have the correct checksum after
+ * the migration.
+ */
+ ei = EXT4_I(inode);
+ EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
i_size_write(tmp_inode, i_size_read(inode));
/*
* Set the i_nlink to zero so it will be deleted later
@@ -467,7 +474,6 @@ int ext4_ext_migrate(struct inode *inode)
clear_nlink(tmp_inode);
ext4_ext_tree_init(handle, tmp_inode);
- ext4_orphan_add(handle, tmp_inode);
ext4_journal_stop(handle);
/*
@@ -479,7 +485,7 @@ int ext4_ext_migrate(struct inode *inode)
* when we add extents we extent the journal
*/
/*
- * Even though we take i_mutex we can still cause block
+ * Even though we take i_rwsem we can still cause block
* allocation via mmap write to holes. If we have allocated
* new blocks we fail migrate. New block allocation will
* clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
@@ -492,17 +498,10 @@ int ext4_ext_migrate(struct inode *inode)
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
- /*
- * It is impossible to update on-disk structures without
- * a handle, so just rollback in-core changes and live other
- * work to orphan_list_cleanup()
- */
- ext4_orphan_del(NULL, tmp_inode);
retval = PTR_ERR(handle);
goto out_tmp_inode;
}
- ei = EXT4_I(inode);
i_data = ei->i_data;
memset(&lb, 0, sizeof(lb));
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 64a579734f93..95aa212f0863 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -632,7 +632,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
/* Check hole before the start pos */
if (cur_blk + cur_len - 1 < o_start) {
if (next_blk == EXT_MAX_BLOCKS) {
- o_start = o_end;
ret = -ENODATA;
goto out;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index da7698341d7d..767b4bfe39c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1317,7 +1317,7 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
dx_set_count(entries, count + 1);
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/*
* Test whether a case-insensitive directory entry matches the filename
* being searched for. If quick is set, assume the name being looked up
@@ -1428,7 +1428,7 @@ static bool ext4_match(struct inode *parent,
f.crypto_buf = fname->crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) &&
(!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
if (fname->cf_name.name) {
@@ -1439,7 +1439,7 @@ static bool ext4_match(struct inode *parent,
fname->hinfo.minor_hash !=
EXT4_DIRENT_MINOR_HASH(de)) {
- return 0;
+ return false;
}
}
return !ext4_ci_compare(parent, &cf, de->name,
@@ -1466,10 +1466,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
- while ((char *) de < dlimit) {
+ while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit &&
+ if (de->name + de->name_len <= dlimit &&
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
@@ -1800,7 +1800,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
}
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (!inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
@@ -2308,7 +2308,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
return -EINVAL;
@@ -2997,14 +2997,14 @@ bool ext4_empty_dir(struct inode *inode)
if (inode->i_size < ext4_dir_rec_len(1, NULL) +
ext4_dir_rec_len(2, NULL)) {
EXT4_ERROR_INODE(inode, "invalid size");
- return true;
+ return false;
}
/* The first directory block must not be a hole,
* so treat it as DIRENT_HTREE
*/
bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
if (IS_ERR(bh))
- return true;
+ return false;
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
@@ -3012,7 +3012,7 @@ bool ext4_empty_dir(struct inode *inode)
le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
- return true;
+ return false;
}
offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
de = ext4_next_entry(de, sb->s_blocksize);
@@ -3021,7 +3021,7 @@ bool ext4_empty_dir(struct inode *inode)
le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
- return true;
+ return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
while (offset < inode->i_size) {
@@ -3035,7 +3035,7 @@ bool ext4_empty_dir(struct inode *inode)
continue;
}
if (IS_ERR(bh))
- return true;
+ return false;
}
de = (struct ext4_dir_entry_2 *) (bh->b_data +
(offset & (sb->s_blocksize - 1)));
@@ -3126,7 +3126,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_fc_track_unlink(handle, dentry);
retval = ext4_mark_inode_dirty(handle, dir);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
@@ -3231,7 +3231,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
if (!retval)
ext4_fc_track_unlink(handle, dentry);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
@@ -3889,14 +3889,21 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
* dirents in directories.
*/
ext4_fc_mark_ineligible(old.inode->i_sb,
- EXT4_FC_REASON_RENAME_DIR);
+ EXT4_FC_REASON_RENAME_DIR, handle);
} else {
+ struct super_block *sb = old.inode->i_sb;
+
if (new.inode)
ext4_fc_track_unlink(handle, new.dentry);
- __ext4_fc_track_link(handle, old.inode, new.dentry);
- __ext4_fc_track_unlink(handle, old.inode, old.dentry);
- if (whiteout)
- __ext4_fc_track_create(handle, whiteout, old.dentry);
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) {
+ __ext4_fc_track_link(handle, old.inode, new.dentry);
+ __ext4_fc_track_unlink(handle, old.inode, old.dentry);
+ if (whiteout)
+ __ext4_fc_track_create(handle, whiteout,
+ old.dentry);
+ }
}
if (new.inode) {
@@ -4049,7 +4056,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlikely(retval))
goto end_rename;
ext4_fc_mark_ineligible(new.inode->i_sb,
- EXT4_FC_REASON_CROSS_RENAME);
+ EXT4_FC_REASON_CROSS_RENAME, handle);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 53adc8f570a3..7de0612eb42d 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -93,7 +93,7 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
* At filesystem recovery time, we walk this list deleting unlinked
* inodes and truncating linked inodes in ext4_orphan_cleanup().
*
- * Orphan list manipulation functions must be called under i_mutex unless
+ * Orphan list manipulation functions must be called under i_rwsem unless
* we are just creating the inode or deleting it.
*/
int ext4_orphan_add(handle_t *handle, struct inode *inode)
@@ -119,7 +119,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
/*
* Orphan handling is only valid for files with data blocks
* being truncated, or files being unlinked. Note that we either
- * hold i_mutex, or the inode can not be referenced from outside,
+ * hold i_rwsem, or the inode can not be referenced from outside,
* so i_nlink should not be bumped due to race
*/
ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index f038d578d8d8..14695e2b5042 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -24,7 +24,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/backing-dev.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -134,8 +134,10 @@ static void ext4_finish_bio(struct bio *bio)
continue;
}
clear_buffer_async_write(bh);
- if (bio->bi_status)
+ if (bio->bi_status) {
+ set_buffer_write_io_error(bh);
buffer_io_error(bh);
+ }
} while ((bh = bh->b_this_page) != head);
spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
@@ -279,14 +281,14 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
io_end->inode = inode;
INIT_LIST_HEAD(&io_end->list);
INIT_LIST_HEAD(&io_end->list_vec);
- atomic_set(&io_end->count, 1);
+ refcount_set(&io_end->count, 1);
}
return io_end;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
- if (atomic_dec_and_test(&io_end->count)) {
+ if (refcount_dec_and_test(&io_end->count)) {
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
@@ -300,7 +302,7 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
{
int err = 0;
- if (atomic_dec_and_test(&io_end->count)) {
+ if (refcount_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
err = ext4_convert_unwritten_io_end_vec(io_end->handle,
io_end);
@@ -314,7 +316,7 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
- atomic_inc(&io_end->count);
+ refcount_inc(&io_end->count);
return io_end;
}
@@ -323,10 +325,9 @@ static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
- char b[BDEVNAME_SIZE];
- if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
- bio_devname(bio, b),
+ if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
+ bio->bi_bdev,
(long long) bio->bi_iter.bi_sector,
(unsigned) bio_sectors(bio),
bio->bi_status)) {
@@ -372,10 +373,8 @@ void ext4_io_submit(struct ext4_io_submit *io)
struct bio *bio = io->io_bio;
if (bio) {
- int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0;
- io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
- bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
+ if (io->io_wbc->sync_mode == WB_SYNC_ALL)
+ io->io_bio->bi_opf |= REQ_SYNC;
submit_bio(io->io_bio);
}
io->io_bio = NULL;
@@ -398,10 +397,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
@@ -421,10 +419,8 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
submit_and_retry:
ext4_io_submit(io);
}
- if (io->io_bio == NULL) {
+ if (io->io_bio == NULL)
io_submit_init_bio(io, bh);
- io->io_bio->bi_write_hint = inode->i_write_hint;
- }
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
@@ -523,12 +519,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ret = PTR_ERR(bounce_page);
if (ret == -ENOMEM &&
(io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
- gfp_flags = GFP_NOFS;
+ gfp_t new_gfp_flags = GFP_NOFS;
if (io->io_bio)
ext4_io_submit(io);
else
- gfp_flags |= __GFP_NOFAIL;
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ new_gfp_flags |= __GFP_NOFAIL;
+ memalloc_retry_wait(gfp_flags);
+ gfp_flags = new_gfp_flags;
goto retry_encrypt;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 3db923403505..af491e170c4a 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -43,7 +43,6 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
-#include <linux/cleancache.h>
#include "ext4.h"
@@ -110,7 +109,7 @@ static void verity_work(struct work_struct *work)
struct bio *bio = ctx->bio;
/*
- * fsverity_verify_bio() may call readpages() again, and although verity
+ * fsverity_verify_bio() may call readahead() again, and although verity
* will be disabled for that, decryption may still be needed, causing
* another bio_post_read_ctx to be allocated. So to guarantee that
* mempool_alloc() never deadlocks we must free the current ctx first.
@@ -350,11 +349,6 @@ int ext4_mpage_readpages(struct inode *inode,
} else if (fully_mapped) {
SetPageMappedToDisk(page);
}
- if (fully_mapped && blocks_per_page == 1 &&
- !PageUptodate(page) && cleancache_get_page(page) == 0) {
- SetPageUptodate(page);
- goto confused;
- }
/*
* This page will go to BIO. Do we need to send this
@@ -371,15 +365,15 @@ int ext4_mpage_readpages(struct inode *inode,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
+ bio = bio_alloc(bdev, bio_max_segs(nr_pages),
+ REQ_OP_READ, GFP_KERNEL);
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ,
- rac ? REQ_RAHEAD : 0);
+ if (rac)
+ bio->bi_opf |= REQ_RAHEAD;
}
length = first_hole << blkbits;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b63cb88ccdae..90a941d20dff 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -14,6 +14,7 @@
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include "ext4_jbd2.h"
@@ -483,7 +484,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
}
ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n",
first_cluster, first_cluster - start, count2);
- ext4_set_bits(bh->b_data, first_cluster - start, count2);
+ mb_set_bits(bh->b_data, first_cluster - start, count2);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@@ -632,7 +633,7 @@ handle_bb:
if (overhead != 0) {
ext4_debug("mark backup superblock %#04llx (+0)\n",
start);
- ext4_set_bits(bh->b_data, 0,
+ mb_set_bits(bh->b_data, 0,
EXT4_NUM_B2C(sbi, overhead));
}
ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count),
@@ -717,12 +718,23 @@ out:
* sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
* For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
*/
-static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
- unsigned *five, unsigned *seven)
+unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three,
+ unsigned int *five, unsigned int *seven)
{
- unsigned *min = three;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ unsigned int *min = three;
int mult = 3;
- unsigned ret;
+ unsigned int ret;
+
+ if (ext4_has_feature_sparse_super2(sb)) {
+ do {
+ if (*min > 2)
+ return UINT_MAX;
+ ret = le32_to_cpu(es->s_backup_bgs[*min - 1]);
+ *min += 1;
+ } while (!ret);
+ return ret;
+ }
if (!ext4_has_feature_sparse_super(sb)) {
ret = *min;
@@ -2089,7 +2101,7 @@ retry:
*/
while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
flexbg_size)) {
- if (jiffies - last_update_time > HZ * 10) {
+ if (time_is_before_jiffies(last_update_time + HZ * 10)) {
if (last_update_time)
ext4_msg(sb, KERN_INFO,
"resized to %llu blocks",
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88d5d274a868..1466fbdbc8e3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -39,13 +39,15 @@
#include <linux/log2.h>
#include <linux/crc16.h>
#include <linux/dax.h>
-#include <linux/cleancache.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/fsnotify.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "ext4.h"
#include "ext4_extents.h" /* Needed for trace points definition */
@@ -72,12 +74,9 @@ static int ext4_mark_recovery_complete(struct super_block *sb,
static int ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
-static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
-static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static void ext4_destroy_lazyinit_thread(void);
@@ -85,6 +84,16 @@ static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
unsigned int journal_inum);
+static int ext4_validate_options(struct fs_context *fc);
+static int ext4_check_opt_consistency(struct fs_context *fc,
+ struct super_block *sb);
+static int ext4_apply_options(struct fs_context *fc, struct super_block *sb);
+static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
+static int ext4_get_tree(struct fs_context *fc);
+static int ext4_reconfigure(struct fs_context *fc);
+static void ext4_fc_free(struct fs_context *fc);
+static int ext4_init_fs_context(struct fs_context *fc);
+static const struct fs_parameter_spec ext4_param_specs[];
/*
* Lock ordering
@@ -112,13 +121,22 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
* transaction start -> page lock(s) -> i_data_sem (rw)
*/
+static const struct fs_context_operations ext4_context_ops = {
+ .parse_param = ext4_parse_param,
+ .get_tree = ext4_get_tree,
+ .reconfigure = ext4_reconfigure,
+ .free = ext4_fc_free,
+};
+
+
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext2",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "ext2",
+ .init_fs_context = ext4_init_fs_context,
+ .parameters = ext4_param_specs,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
@@ -129,11 +147,12 @@ MODULE_ALIAS("ext2");
static struct file_system_type ext3_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext3",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "ext3",
+ .init_fs_context = ext4_init_fs_context,
+ .parameters = ext4_param_specs,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
@@ -259,8 +278,8 @@ static int ext4_verify_csum_type(struct super_block *sb,
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
}
-static __le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es)
+__le32 ext4_superblock_csum(struct super_block *sb,
+ struct ext4_super_block *es)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct ext4_super_block, s_checksum);
@@ -759,6 +778,8 @@ void __ext4_error(struct super_block *sb, const char *function,
sb->s_id, function, line, current->comm, &vaf);
va_end(args);
}
+ fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
+
ext4_handle_error(sb, force_ro, error, 0, block, function, line);
}
@@ -789,6 +810,8 @@ void __ext4_error_inode(struct inode *inode, const char *function,
current->comm, &vaf);
va_end(args);
}
+ fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED);
+
ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
function, line);
}
@@ -827,6 +850,8 @@ void __ext4_error_file(struct file *file, const char *function,
current->comm, path, &vaf);
va_end(args);
}
+ fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED);
+
ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
function, line);
}
@@ -894,6 +919,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
sb->s_id, function, line, errstr);
}
+ fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED);
ext4_handle_error(sb, false, -errno, 0, 0, function, line);
}
@@ -904,14 +930,20 @@ void __ext4_msg(struct super_block *sb,
struct va_format vaf;
va_list args;
- atomic_inc(&EXT4_SB(sb)->s_msg_count);
- if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
- return;
+ if (sb) {
+ atomic_inc(&EXT4_SB(sb)->s_msg_count);
+ if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state),
+ "EXT4-fs"))
+ return;
+ }
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+ if (sb)
+ printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+ else
+ printk("%sEXT4-fs: %pV\n", prefix, &vaf);
va_end(args);
}
@@ -1167,20 +1199,25 @@ static void ext4_put_super(struct super_block *sb)
int aborted = 0;
int i, err;
- ext4_unregister_li_request(sb);
- ext4_quota_off_umount(sb);
-
- flush_work(&sbi->s_error_work);
- destroy_workqueue(sbi->rsv_conversion_wq);
- ext4_release_orphan_info(sb);
-
/*
* Unregister sysfs before destroying jbd2 journal.
* Since we could still access attr_journal_task attribute via sysfs
* path which could have sbi->s_journal->j_task as NULL
+ * Unregister sysfs before flush sbi->s_error_work.
+ * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
+ * read metadata verify failed then will queue error work.
+ * flush_stashed_error_work will call start_this_handle may trigger
+ * BUG_ON.
*/
ext4_unregister_sysfs(sb);
+ ext4_unregister_li_request(sb);
+ ext4_quota_off_umount(sb);
+
+ flush_work(&sbi->s_error_work);
+ destroy_workqueue(sbi->rsv_conversion_wq);
+ ext4_release_orphan_info(sb);
+
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
@@ -1269,7 +1306,7 @@ static void ext4_put_super(struct super_block *sb)
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
#endif
kfree(sbi);
@@ -1284,7 +1321,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
{
struct ext4_inode_info *ei;
- ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
@@ -1572,7 +1609,6 @@ static const struct fscrypt_operations ext4_cryptops = {
.set_context = ext4_set_context,
.get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
- .max_namelen = EXT4_NAME_LEN,
.has_stable_inodes = ext4_has_stable_inodes,
.get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
@@ -1640,7 +1676,6 @@ static const struct super_operations ext4_sops = {
.freeze_fs = ext4_freeze,
.unfreeze_fs = ext4_unfreeze,
.statfs = ext4_statfs,
- .remount_fs = ext4_remount,
.show_options = ext4_show_options,
#ifdef CONFIG_QUOTA
.quota_read = ext4_quota_read,
@@ -1658,7 +1693,7 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
- Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
+ Opt_resgid, Opt_resuid, Opt_sb,
Opt_nouid32, Opt_debug, Opt_removed,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
@@ -1667,152 +1702,169 @@ enum {
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
Opt_inlinecrypt,
- Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+ Opt_usrjquota, Opt_grpjquota, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
- Opt_nowarn_on_error, Opt_mblk_io_submit,
- Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
+ Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
+ Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
#ifdef CONFIG_EXT4_DEBUG
Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};
-static const match_table_t tokens = {
- {Opt_bsd_df, "bsddf"},
- {Opt_minix_df, "minixdf"},
- {Opt_grpid, "grpid"},
- {Opt_grpid, "bsdgroups"},
- {Opt_nogrpid, "nogrpid"},
- {Opt_nogrpid, "sysvgroups"},
- {Opt_resgid, "resgid=%u"},
- {Opt_resuid, "resuid=%u"},
- {Opt_sb, "sb=%u"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_nouid32, "nouid32"},
- {Opt_debug, "debug"},
- {Opt_removed, "oldalloc"},
- {Opt_removed, "orlov"},
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_noload, "norecovery"},
- {Opt_noload, "noload"},
- {Opt_removed, "nobh"},
- {Opt_removed, "bh"},
- {Opt_commit, "commit=%u"},
- {Opt_min_batch_time, "min_batch_time=%u"},
- {Opt_max_batch_time, "max_batch_time=%u"},
- {Opt_journal_dev, "journal_dev=%u"},
- {Opt_journal_path, "journal_path=%s"},
- {Opt_journal_checksum, "journal_checksum"},
- {Opt_nojournal_checksum, "nojournal_checksum"},
- {Opt_journal_async_commit, "journal_async_commit"},
- {Opt_abort, "abort"},
- {Opt_data_journal, "data=journal"},
- {Opt_data_ordered, "data=ordered"},
- {Opt_data_writeback, "data=writeback"},
- {Opt_data_err_abort, "data_err=abort"},
- {Opt_data_err_ignore, "data_err=ignore"},
- {Opt_offusrjquota, "usrjquota="},
- {Opt_usrjquota, "usrjquota=%s"},
- {Opt_offgrpjquota, "grpjquota="},
- {Opt_grpjquota, "grpjquota=%s"},
- {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
- {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
- {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_grpquota, "grpquota"},
- {Opt_noquota, "noquota"},
- {Opt_quota, "quota"},
- {Opt_usrquota, "usrquota"},
- {Opt_prjquota, "prjquota"},
- {Opt_barrier, "barrier=%u"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_i_version, "i_version"},
- {Opt_dax, "dax"},
- {Opt_dax_always, "dax=always"},
- {Opt_dax_inode, "dax=inode"},
- {Opt_dax_never, "dax=never"},
- {Opt_stripe, "stripe=%u"},
- {Opt_delalloc, "delalloc"},
- {Opt_warn_on_error, "warn_on_error"},
- {Opt_nowarn_on_error, "nowarn_on_error"},
- {Opt_lazytime, "lazytime"},
- {Opt_nolazytime, "nolazytime"},
- {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
- {Opt_nodelalloc, "nodelalloc"},
- {Opt_removed, "mblk_io_submit"},
- {Opt_removed, "nomblk_io_submit"},
- {Opt_block_validity, "block_validity"},
- {Opt_noblock_validity, "noblock_validity"},
- {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
- {Opt_journal_ioprio, "journal_ioprio=%u"},
- {Opt_auto_da_alloc, "auto_da_alloc=%u"},
- {Opt_auto_da_alloc, "auto_da_alloc"},
- {Opt_noauto_da_alloc, "noauto_da_alloc"},
- {Opt_dioread_nolock, "dioread_nolock"},
- {Opt_dioread_lock, "nodioread_nolock"},
- {Opt_dioread_lock, "dioread_lock"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_init_itable, "init_itable=%u"},
- {Opt_init_itable, "init_itable"},
- {Opt_noinit_itable, "noinit_itable"},
-#ifdef CONFIG_EXT4_DEBUG
- {Opt_fc_debug_force, "fc_debug_force"},
- {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
-#endif
- {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
- {Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
- {Opt_test_dummy_encryption, "test_dummy_encryption"},
- {Opt_inlinecrypt, "inlinecrypt"},
- {Opt_nombcache, "nombcache"},
- {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
- {Opt_removed, "prefetch_block_bitmaps"},
- {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"},
- {Opt_mb_optimize_scan, "mb_optimize_scan=%d"},
- {Opt_removed, "check=none"}, /* mount option from ext2/3 */
- {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
- {Opt_removed, "reservation"}, /* mount option from ext2/3 */
- {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
- {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
- {Opt_err, NULL},
+static const struct constant_table ext4_param_errors[] = {
+ {"continue", EXT4_MOUNT_ERRORS_CONT},
+ {"panic", EXT4_MOUNT_ERRORS_PANIC},
+ {"remount-ro", EXT4_MOUNT_ERRORS_RO},
+ {}
};
-static ext4_fsblk_t get_sb_block(void **data)
-{
- ext4_fsblk_t sb_block;
- char *options = (char *) *data;
+static const struct constant_table ext4_param_data[] = {
+ {"journal", EXT4_MOUNT_JOURNAL_DATA},
+ {"ordered", EXT4_MOUNT_ORDERED_DATA},
+ {"writeback", EXT4_MOUNT_WRITEBACK_DATA},
+ {}
+};
- if (!options || strncmp(options, "sb=", 3) != 0)
- return 1; /* Default location */
+static const struct constant_table ext4_param_data_err[] = {
+ {"abort", Opt_data_err_abort},
+ {"ignore", Opt_data_err_ignore},
+ {}
+};
- options += 3;
- /* TODO: use simple_strtoll with >32bit ext4 */
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
- printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
- (char *) *data);
- return 1;
- }
- if (*options == ',')
- options++;
- *data = (void *) options;
+static const struct constant_table ext4_param_jqfmt[] = {
+ {"vfsold", QFMT_VFS_OLD},
+ {"vfsv0", QFMT_VFS_V0},
+ {"vfsv1", QFMT_VFS_V1},
+ {}
+};
- return sb_block;
-}
+static const struct constant_table ext4_param_dax[] = {
+ {"always", Opt_dax_always},
+ {"inode", Opt_dax_inode},
+ {"never", Opt_dax_never},
+ {}
+};
+
+/* String parameter that allows empty argument */
+#define fsparam_string_empty(NAME, OPT) \
+ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
+
+/*
+ * Mount option specification
+ * We don't use fsparam_flag_no because of the way we set the
+ * options and the way we show them in _ext4_show_options(). To
+ * keep the changes to a minimum, let's keep the negative options
+ * separate for now.
+ */
+static const struct fs_parameter_spec ext4_param_specs[] = {
+ fsparam_flag ("bsddf", Opt_bsd_df),
+ fsparam_flag ("minixdf", Opt_minix_df),
+ fsparam_flag ("grpid", Opt_grpid),
+ fsparam_flag ("bsdgroups", Opt_grpid),
+ fsparam_flag ("nogrpid", Opt_nogrpid),
+ fsparam_flag ("sysvgroups", Opt_nogrpid),
+ fsparam_u32 ("resgid", Opt_resgid),
+ fsparam_u32 ("resuid", Opt_resuid),
+ fsparam_u32 ("sb", Opt_sb),
+ fsparam_enum ("errors", Opt_errors, ext4_param_errors),
+ fsparam_flag ("nouid32", Opt_nouid32),
+ fsparam_flag ("debug", Opt_debug),
+ fsparam_flag ("oldalloc", Opt_removed),
+ fsparam_flag ("orlov", Opt_removed),
+ fsparam_flag ("user_xattr", Opt_user_xattr),
+ fsparam_flag ("nouser_xattr", Opt_nouser_xattr),
+ fsparam_flag ("acl", Opt_acl),
+ fsparam_flag ("noacl", Opt_noacl),
+ fsparam_flag ("norecovery", Opt_noload),
+ fsparam_flag ("noload", Opt_noload),
+ fsparam_flag ("bh", Opt_removed),
+ fsparam_flag ("nobh", Opt_removed),
+ fsparam_u32 ("commit", Opt_commit),
+ fsparam_u32 ("min_batch_time", Opt_min_batch_time),
+ fsparam_u32 ("max_batch_time", Opt_max_batch_time),
+ fsparam_u32 ("journal_dev", Opt_journal_dev),
+ fsparam_bdev ("journal_path", Opt_journal_path),
+ fsparam_flag ("journal_checksum", Opt_journal_checksum),
+ fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum),
+ fsparam_flag ("journal_async_commit",Opt_journal_async_commit),
+ fsparam_flag ("abort", Opt_abort),
+ fsparam_enum ("data", Opt_data, ext4_param_data),
+ fsparam_enum ("data_err", Opt_data_err,
+ ext4_param_data_err),
+ fsparam_string_empty
+ ("usrjquota", Opt_usrjquota),
+ fsparam_string_empty
+ ("grpjquota", Opt_grpjquota),
+ fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("noquota", Opt_noquota),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("prjquota", Opt_prjquota),
+ fsparam_flag ("barrier", Opt_barrier),
+ fsparam_u32 ("barrier", Opt_barrier),
+ fsparam_flag ("nobarrier", Opt_nobarrier),
+ fsparam_flag ("i_version", Opt_i_version),
+ fsparam_flag ("dax", Opt_dax),
+ fsparam_enum ("dax", Opt_dax_type, ext4_param_dax),
+ fsparam_u32 ("stripe", Opt_stripe),
+ fsparam_flag ("delalloc", Opt_delalloc),
+ fsparam_flag ("nodelalloc", Opt_nodelalloc),
+ fsparam_flag ("warn_on_error", Opt_warn_on_error),
+ fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error),
+ fsparam_u32 ("debug_want_extra_isize",
+ Opt_debug_want_extra_isize),
+ fsparam_flag ("mblk_io_submit", Opt_removed),
+ fsparam_flag ("nomblk_io_submit", Opt_removed),
+ fsparam_flag ("block_validity", Opt_block_validity),
+ fsparam_flag ("noblock_validity", Opt_noblock_validity),
+ fsparam_u32 ("inode_readahead_blks",
+ Opt_inode_readahead_blks),
+ fsparam_u32 ("journal_ioprio", Opt_journal_ioprio),
+ fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc),
+ fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc),
+ fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc),
+ fsparam_flag ("dioread_nolock", Opt_dioread_nolock),
+ fsparam_flag ("nodioread_nolock", Opt_dioread_lock),
+ fsparam_flag ("dioread_lock", Opt_dioread_lock),
+ fsparam_flag ("discard", Opt_discard),
+ fsparam_flag ("nodiscard", Opt_nodiscard),
+ fsparam_u32 ("init_itable", Opt_init_itable),
+ fsparam_flag ("init_itable", Opt_init_itable),
+ fsparam_flag ("noinit_itable", Opt_noinit_itable),
+#ifdef CONFIG_EXT4_DEBUG
+ fsparam_flag ("fc_debug_force", Opt_fc_debug_force),
+ fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay),
+#endif
+ fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb),
+ fsparam_flag ("test_dummy_encryption",
+ Opt_test_dummy_encryption),
+ fsparam_string ("test_dummy_encryption",
+ Opt_test_dummy_encryption),
+ fsparam_flag ("inlinecrypt", Opt_inlinecrypt),
+ fsparam_flag ("nombcache", Opt_nombcache),
+ fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */
+ fsparam_flag ("prefetch_block_bitmaps",
+ Opt_removed),
+ fsparam_flag ("no_prefetch_block_bitmaps",
+ Opt_no_prefetch_block_bitmaps),
+ fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan),
+ fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */
+ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */
+ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */
+ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */
+ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */
+ {}
+};
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
#define DEFAULT_MB_OPTIMIZE_SCAN (-1)
@@ -1821,90 +1873,22 @@ static const char deprecated_msg[] =
"Mount option \"%s\" will be removed by %s\n"
"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
-#ifdef CONFIG_QUOTA
-static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
- int ret = -1;
-
- if (sb_any_quota_loaded(sb) && !old_qname) {
- ext4_msg(sb, KERN_ERR,
- "Cannot change journaled "
- "quota options when quota turned on");
- return -1;
- }
- if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_INFO, "Journaled quota options "
- "ignored when QUOTA feature is enabled");
- return 1;
- }
- qname = match_strdup(args);
- if (!qname) {
- ext4_msg(sb, KERN_ERR,
- "Not enough memory for storing quotafile name");
- return -1;
- }
- if (old_qname) {
- if (strcmp(old_qname, qname) == 0)
- ret = 1;
- else
- ext4_msg(sb, KERN_ERR,
- "%s quota file already specified",
- QTYPE2NAME(qtype));
- goto errout;
- }
- if (strchr(qname, '/')) {
- ext4_msg(sb, KERN_ERR,
- "quotafile must be on filesystem root");
- goto errout;
- }
- rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
- set_opt(sb, QUOTA);
- return 1;
-errout:
- kfree(qname);
- return ret;
-}
-
-static int clear_qf_name(struct super_block *sb, int qtype)
-{
-
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *old_qname = get_qf_name(sb, sbi, qtype);
-
- if (sb_any_quota_loaded(sb) && old_qname) {
- ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
- " when quota turned on");
- return -1;
- }
- rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
- synchronize_rcu();
- kfree(old_qname);
- return 1;
-}
-#endif
-
#define MOPT_SET 0x0001
#define MOPT_CLEAR 0x0002
#define MOPT_NOSUPPORT 0x0004
#define MOPT_EXPLICIT 0x0008
-#define MOPT_CLEAR_ERR 0x0010
-#define MOPT_GTE0 0x0020
#ifdef CONFIG_QUOTA
#define MOPT_Q 0
-#define MOPT_QFMT 0x0040
+#define MOPT_QFMT 0x0010
#else
#define MOPT_Q MOPT_NOSUPPORT
#define MOPT_QFMT MOPT_NOSUPPORT
#endif
-#define MOPT_DATAJ 0x0080
-#define MOPT_NO_EXT2 0x0100
-#define MOPT_NO_EXT3 0x0200
+#define MOPT_NO_EXT2 0x0020
+#define MOPT_NO_EXT3 0x0040
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
-#define MOPT_STRING 0x0400
-#define MOPT_SKIP 0x0800
-#define MOPT_2 0x1000
+#define MOPT_SKIP 0x0080
+#define MOPT_2 0x0100
static const struct mount_opts {
int token;
@@ -1937,40 +1921,17 @@ static const struct mount_opts {
EXT4_MOUNT_JOURNAL_CHECKSUM),
MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
- {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2},
- {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2},
+ {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2},
{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
- {Opt_commit, 0, MOPT_GTE0},
- {Opt_max_batch_time, 0, MOPT_GTE0},
- {Opt_min_batch_time, 0, MOPT_GTE0},
- {Opt_inode_readahead_blks, 0, MOPT_GTE0},
- {Opt_init_itable, 0, MOPT_GTE0},
- {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
- {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
- MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
- {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
- MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
- {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
- MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
- {Opt_stripe, 0, MOPT_GTE0},
- {Opt_resuid, 0, MOPT_GTE0},
- {Opt_resgid, 0, MOPT_GTE0},
- {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
- {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
- {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
- {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
- {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
- {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
- MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_dax_type, 0, MOPT_EXT4_ONLY},
+ {Opt_journal_dev, 0, MOPT_NO_EXT2},
+ {Opt_journal_path, 0, MOPT_NO_EXT2},
+ {Opt_journal_ioprio, 0, MOPT_NO_EXT2},
+ {Opt_data, 0, MOPT_NO_EXT2},
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1982,7 +1943,6 @@ static const struct mount_opts {
#endif
{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
- {Opt_debug_want_extra_isize, 0, MOPT_GTE0},
{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
MOPT_SET | MOPT_Q},
@@ -1993,499 +1953,1005 @@ static const struct mount_opts {
{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
MOPT_CLEAR | MOPT_Q},
- {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING},
- {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING},
- {Opt_offusrjquota, 0, MOPT_Q},
- {Opt_offgrpjquota, 0, MOPT_Q},
- {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
- {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
- {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
- {Opt_max_dir_size_kb, 0, MOPT_GTE0},
- {Opt_test_dummy_encryption, 0, MOPT_STRING},
+ {Opt_usrjquota, 0, MOPT_Q},
+ {Opt_grpjquota, 0, MOPT_Q},
+ {Opt_jqfmt, 0, MOPT_QFMT},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
{Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
MOPT_SET},
- {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0},
#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
- {Opt_fc_debug_max_replay, 0, MOPT_GTE0},
#endif
{Opt_err, 0, 0}
};
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
static const struct ext4_sb_encodings {
__u16 magic;
char *name;
- char *version;
+ unsigned int version;
} ext4_sb_encoding_map[] = {
- {EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
+ {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
};
-static int ext4_sb_read_encoding(const struct ext4_super_block *es,
- const struct ext4_sb_encodings **encoding,
- __u16 *flags)
+static const struct ext4_sb_encodings *
+ext4_sb_read_encoding(const struct ext4_super_block *es)
{
__u16 magic = le16_to_cpu(es->s_encoding);
int i;
for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
if (magic == ext4_sb_encoding_map[i].magic)
- break;
-
- if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
- return -EINVAL;
+ return &ext4_sb_encoding_map[i];
- *encoding = &ext4_sb_encoding_map[i];
- *flags = le16_to_cpu(es->s_encoding_flags);
-
- return 0;
+ return NULL;
}
#endif
-static int ext4_set_test_dummy_encryption(struct super_block *sb,
- const char *opt,
- const substring_t *arg,
- bool is_remount)
+static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
{
#ifdef CONFIG_FS_ENCRYPTION
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
- /*
- * This mount option is just for testing, and it's not worthwhile to
- * implement the extra complexity (e.g. RCU protection) that would be
- * needed to allow it to be set or changed during remount. We do allow
- * it to be specified during remount, but only if there is no change.
- */
- if (is_remount && !sbi->s_dummy_enc_policy.policy) {
- ext4_msg(sb, KERN_WARNING,
- "Can't set test_dummy_encryption on remount");
- return -1;
- }
- err = fscrypt_set_test_dummy_encryption(sb, arg->from,
+ err = fscrypt_set_test_dummy_encryption(sb, arg,
&sbi->s_dummy_enc_policy);
if (err) {
- if (err == -EEXIST)
- ext4_msg(sb, KERN_WARNING,
- "Can't change test_dummy_encryption on remount");
- else if (err == -EINVAL)
- ext4_msg(sb, KERN_WARNING,
- "Value of option \"%s\" is unrecognized", opt);
- else
- ext4_msg(sb, KERN_WARNING,
- "Error processing option \"%s\" [%d]",
- opt, err);
- return -1;
+ ext4_msg(sb, KERN_WARNING,
+ "Error while setting test dummy encryption [%d]", err);
+ return err;
}
ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
-#else
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mount option ignored");
#endif
- return 1;
+ return 0;
}
-struct ext4_parsed_options {
- unsigned long journal_devnum;
- unsigned int journal_ioprio;
- int mb_optimize_scan;
+#define EXT4_SPEC_JQUOTA (1 << 0)
+#define EXT4_SPEC_JQFMT (1 << 1)
+#define EXT4_SPEC_DATAJ (1 << 2)
+#define EXT4_SPEC_SB_BLOCK (1 << 3)
+#define EXT4_SPEC_JOURNAL_DEV (1 << 4)
+#define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5)
+#define EXT4_SPEC_DUMMY_ENCRYPTION (1 << 6)
+#define EXT4_SPEC_s_want_extra_isize (1 << 7)
+#define EXT4_SPEC_s_max_batch_time (1 << 8)
+#define EXT4_SPEC_s_min_batch_time (1 << 9)
+#define EXT4_SPEC_s_inode_readahead_blks (1 << 10)
+#define EXT4_SPEC_s_li_wait_mult (1 << 11)
+#define EXT4_SPEC_s_max_dir_size_kb (1 << 12)
+#define EXT4_SPEC_s_stripe (1 << 13)
+#define EXT4_SPEC_s_resuid (1 << 14)
+#define EXT4_SPEC_s_resgid (1 << 15)
+#define EXT4_SPEC_s_commit_interval (1 << 16)
+#define EXT4_SPEC_s_fc_debug_max_replay (1 << 17)
+#define EXT4_SPEC_s_sb_block (1 << 18)
+#define EXT4_SPEC_mb_optimize_scan (1 << 19)
+
+struct ext4_fs_context {
+ char *s_qf_names[EXT4_MAXQUOTAS];
+ char *test_dummy_enc_arg;
+ int s_jquota_fmt; /* Format of quota to use */
+#ifdef CONFIG_EXT4_DEBUG
+ int s_fc_debug_max_replay;
+#endif
+ unsigned short qname_spec;
+ unsigned long vals_s_flags; /* Bits to set in s_flags */
+ unsigned long mask_s_flags; /* Bits changed in s_flags */
+ unsigned long journal_devnum;
+ unsigned long s_commit_interval;
+ unsigned long s_stripe;
+ unsigned int s_inode_readahead_blks;
+ unsigned int s_want_extra_isize;
+ unsigned int s_li_wait_mult;
+ unsigned int s_max_dir_size_kb;
+ unsigned int journal_ioprio;
+ unsigned int vals_s_mount_opt;
+ unsigned int mask_s_mount_opt;
+ unsigned int vals_s_mount_opt2;
+ unsigned int mask_s_mount_opt2;
+ unsigned long vals_s_mount_flags;
+ unsigned long mask_s_mount_flags;
+ unsigned int opt_flags; /* MOPT flags */
+ unsigned int spec;
+ u32 s_max_batch_time;
+ u32 s_min_batch_time;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ ext4_fsblk_t s_sb_block;
};
-static int handle_mount_opt(struct super_block *sb, char *opt, int token,
- substring_t *args, struct ext4_parsed_options *parsed_opts,
- int is_remount)
+static void ext4_fc_free(struct fs_context *fc)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fs_context *ctx = fc->fs_private;
+ int i;
+
+ if (!ctx)
+ return;
+
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ kfree(ctx->s_qf_names[i]);
+
+ kfree(ctx->test_dummy_enc_arg);
+ kfree(ctx);
+}
+
+int ext4_init_fs_context(struct fs_context *fc)
+{
+ struct ext4_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->fs_private = ctx;
+ fc->ops = &ext4_context_ops;
+
+ return 0;
+}
+
+#ifdef CONFIG_QUOTA
+/*
+ * Note the name of the specified quota file.
+ */
+static int note_qf_name(struct fs_context *fc, int qtype,
+ struct fs_parameter *param)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ char *qname;
+
+ if (param->size < 1) {
+ ext4_msg(NULL, KERN_ERR, "Missing quota name");
+ return -EINVAL;
+ }
+ if (strchr(param->string, '/')) {
+ ext4_msg(NULL, KERN_ERR,
+ "quotafile must be on filesystem root");
+ return -EINVAL;
+ }
+ if (ctx->s_qf_names[qtype]) {
+ if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) {
+ ext4_msg(NULL, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
+ if (!qname) {
+ ext4_msg(NULL, KERN_ERR,
+ "Not enough memory for storing quotafile name");
+ return -ENOMEM;
+ }
+ ctx->s_qf_names[qtype] = qname;
+ ctx->qname_spec |= 1 << qtype;
+ ctx->spec |= EXT4_SPEC_JQUOTA;
+ return 0;
+}
+
+/*
+ * Clear the name of the specified quota file.
+ */
+static int unnote_qf_name(struct fs_context *fc, int qtype)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+
+ if (ctx->s_qf_names[qtype])
+ kfree(ctx->s_qf_names[qtype]);
+
+ ctx->s_qf_names[qtype] = NULL;
+ ctx->qname_spec |= 1 << qtype;
+ ctx->spec |= EXT4_SPEC_JQUOTA;
+ return 0;
+}
+#endif
+
+#define EXT4_SET_CTX(name) \
+static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
+ unsigned long flag) \
+{ \
+ ctx->mask_s_##name |= flag; \
+ ctx->vals_s_##name |= flag; \
+}
+
+#define EXT4_CLEAR_CTX(name) \
+static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \
+ unsigned long flag) \
+{ \
+ ctx->mask_s_##name |= flag; \
+ ctx->vals_s_##name &= ~flag; \
+}
+
+#define EXT4_TEST_CTX(name) \
+static inline unsigned long \
+ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \
+{ \
+ return (ctx->vals_s_##name & flag); \
+}
+
+EXT4_SET_CTX(flags); /* set only */
+EXT4_SET_CTX(mount_opt);
+EXT4_CLEAR_CTX(mount_opt);
+EXT4_TEST_CTX(mount_opt);
+EXT4_SET_CTX(mount_opt2);
+EXT4_CLEAR_CTX(mount_opt2);
+EXT4_TEST_CTX(mount_opt2);
+
+static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit)
+{
+ set_bit(bit, &ctx->mask_s_mount_flags);
+ set_bit(bit, &ctx->vals_s_mount_flags);
+}
+
+static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
const struct mount_opts *m;
+ int is_remount;
kuid_t uid;
kgid_t gid;
- int arg = 0;
+ int token;
-#ifdef CONFIG_QUOTA
- if (token == Opt_usrjquota)
- return set_qf_name(sb, USRQUOTA, &args[0]);
- else if (token == Opt_grpjquota)
- return set_qf_name(sb, GRPQUOTA, &args[0]);
- else if (token == Opt_offusrjquota)
- return clear_qf_name(sb, USRQUOTA);
- else if (token == Opt_offgrpjquota)
- return clear_qf_name(sb, GRPQUOTA);
-#endif
- switch (token) {
- case Opt_noacl:
- case Opt_nouser_xattr:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
- break;
- case Opt_sb:
- return 1; /* handled by get_sb_block() */
- case Opt_removed:
- ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
- return 1;
- case Opt_abort:
- ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
- return 1;
- case Opt_i_version:
- sb->s_flags |= SB_I_VERSION;
- return 1;
- case Opt_lazytime:
- sb->s_flags |= SB_LAZYTIME;
- return 1;
- case Opt_nolazytime:
- sb->s_flags &= ~SB_LAZYTIME;
- return 1;
- case Opt_inlinecrypt:
-#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
- sb->s_flags |= SB_INLINECRYPT;
-#else
- ext4_msg(sb, KERN_ERR, "inline encryption not supported");
-#endif
- return 1;
- }
+ token = fs_parse(fc, ext4_param_specs, param, &result);
+ if (token < 0)
+ return token;
+ is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
for (m = ext4_mount_opts; m->token != Opt_err; m++)
if (token == m->token)
break;
- if (m->token == Opt_err) {
- ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
- "or missing value", opt);
- return -1;
- }
+ ctx->opt_flags |= m->flags;
- if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Mount option \"%s\" incompatible with ext2", opt);
- return -1;
- }
- if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Mount option \"%s\" incompatible with ext3", opt);
- return -1;
- }
-
- if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
- return -1;
- if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
- return -1;
if (m->flags & MOPT_EXPLICIT) {
if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
- set_opt2(sb, EXPLICIT_DELALLOC);
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
- set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
+ ctx_set_mount_opt2(ctx,
+ EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
} else
- return -1;
- }
- if (m->flags & MOPT_CLEAR_ERR)
- clear_opt(sb, ERRORS_MASK);
- if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot change quota "
- "options when quota turned on");
- return -1;
+ return -EINVAL;
}
if (m->flags & MOPT_NOSUPPORT) {
- ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
- } else if (token == Opt_commit) {
- if (arg == 0)
- arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
- else if (arg > INT_MAX / HZ) {
- ext4_msg(sb, KERN_ERR,
+ ext4_msg(NULL, KERN_ERR, "%s option not supported",
+ param->key);
+ return 0;
+ }
+
+ switch (token) {
+#ifdef CONFIG_QUOTA
+ case Opt_usrjquota:
+ if (!*param->string)
+ return unnote_qf_name(fc, USRQUOTA);
+ else
+ return note_qf_name(fc, USRQUOTA, param);
+ case Opt_grpjquota:
+ if (!*param->string)
+ return unnote_qf_name(fc, GRPQUOTA);
+ else
+ return note_qf_name(fc, GRPQUOTA, param);
+#endif
+ case Opt_noacl:
+ case Opt_nouser_xattr:
+ ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "3.5");
+ break;
+ case Opt_sb:
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Ignoring %s option on remount", param->key);
+ } else {
+ ctx->s_sb_block = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_sb_block;
+ }
+ return 0;
+ case Opt_removed:
+ ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
+ param->key);
+ return 0;
+ case Opt_abort:
+ ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
+ return 0;
+ case Opt_i_version:
+ ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20");
+ ext4_msg(NULL, KERN_WARNING, "Use iversion instead\n");
+ ctx_set_flags(ctx, SB_I_VERSION);
+ return 0;
+ case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ ctx_set_flags(ctx, SB_INLINECRYPT);
+#else
+ ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
+#endif
+ return 0;
+ case Opt_errors:
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
+ ctx_set_mount_opt(ctx, result.uint_32);
+ return 0;
+#ifdef CONFIG_QUOTA
+ case Opt_jqfmt:
+ ctx->s_jquota_fmt = result.uint_32;
+ ctx->spec |= EXT4_SPEC_JQFMT;
+ return 0;
+#endif
+ case Opt_data:
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+ ctx_set_mount_opt(ctx, result.uint_32);
+ ctx->spec |= EXT4_SPEC_DATAJ;
+ return 0;
+ case Opt_commit:
+ if (result.uint_32 == 0)
+ ctx->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ else if (result.uint_32 > INT_MAX / HZ) {
+ ext4_msg(NULL, KERN_ERR,
"Invalid commit interval %d, "
"must be smaller than %d",
- arg, INT_MAX / HZ);
- return -1;
+ result.uint_32, INT_MAX / HZ);
+ return -EINVAL;
}
- sbi->s_commit_interval = HZ * arg;
- } else if (token == Opt_debug_want_extra_isize) {
- if ((arg & 1) ||
- (arg < 4) ||
- (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
- ext4_msg(sb, KERN_ERR,
- "Invalid want_extra_isize %d", arg);
- return -1;
+ ctx->s_commit_interval = HZ * result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_commit_interval;
+ return 0;
+ case Opt_debug_want_extra_isize:
+ if ((result.uint_32 & 1) || (result.uint_32 < 4)) {
+ ext4_msg(NULL, KERN_ERR,
+ "Invalid want_extra_isize %d", result.uint_32);
+ return -EINVAL;
}
- sbi->s_want_extra_isize = arg;
- } else if (token == Opt_max_batch_time) {
- sbi->s_max_batch_time = arg;
- } else if (token == Opt_min_batch_time) {
- sbi->s_min_batch_time = arg;
- } else if (token == Opt_inode_readahead_blks) {
- if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
- ext4_msg(sb, KERN_ERR,
+ ctx->s_want_extra_isize = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_want_extra_isize;
+ return 0;
+ case Opt_max_batch_time:
+ ctx->s_max_batch_time = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_max_batch_time;
+ return 0;
+ case Opt_min_batch_time:
+ ctx->s_min_batch_time = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_min_batch_time;
+ return 0;
+ case Opt_inode_readahead_blks:
+ if (result.uint_32 &&
+ (result.uint_32 > (1 << 30) ||
+ !is_power_of_2(result.uint_32))) {
+ ext4_msg(NULL, KERN_ERR,
"EXT4-fs: inode_readahead_blks must be "
"0 or a power of 2 smaller than 2^31");
- return -1;
+ return -EINVAL;
}
- sbi->s_inode_readahead_blks = arg;
- } else if (token == Opt_init_itable) {
- set_opt(sb, INIT_INODE_TABLE);
- if (!args->from)
- arg = EXT4_DEF_LI_WAIT_MULT;
- sbi->s_li_wait_mult = arg;
- } else if (token == Opt_max_dir_size_kb) {
- sbi->s_max_dir_size_kb = arg;
+ ctx->s_inode_readahead_blks = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_inode_readahead_blks;
+ return 0;
+ case Opt_init_itable:
+ ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE);
+ ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ if (param->type == fs_value_is_string)
+ ctx->s_li_wait_mult = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_li_wait_mult;
+ return 0;
+ case Opt_max_dir_size_kb:
+ ctx->s_max_dir_size_kb = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
+ return 0;
#ifdef CONFIG_EXT4_DEBUG
- } else if (token == Opt_fc_debug_max_replay) {
- sbi->s_fc_debug_max_replay = arg;
+ case Opt_fc_debug_max_replay:
+ ctx->s_fc_debug_max_replay = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay;
+ return 0;
#endif
- } else if (token == Opt_stripe) {
- sbi->s_stripe = arg;
- } else if (token == Opt_resuid) {
- uid = make_kuid(current_user_ns(), arg);
+ case Opt_stripe:
+ ctx->s_stripe = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_stripe;
+ return 0;
+ case Opt_resuid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
if (!uid_valid(uid)) {
- ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
- return -1;
+ ext4_msg(NULL, KERN_ERR, "Invalid uid value %d",
+ result.uint_32);
+ return -EINVAL;
}
- sbi->s_resuid = uid;
- } else if (token == Opt_resgid) {
- gid = make_kgid(current_user_ns(), arg);
+ ctx->s_resuid = uid;
+ ctx->spec |= EXT4_SPEC_s_resuid;
+ return 0;
+ case Opt_resgid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
if (!gid_valid(gid)) {
- ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
- return -1;
+ ext4_msg(NULL, KERN_ERR, "Invalid gid value %d",
+ result.uint_32);
+ return -EINVAL;
}
- sbi->s_resgid = gid;
- } else if (token == Opt_journal_dev) {
+ ctx->s_resgid = gid;
+ ctx->spec |= EXT4_SPEC_s_resgid;
+ return 0;
+ case Opt_journal_dev:
if (is_remount) {
- ext4_msg(sb, KERN_ERR,
+ ext4_msg(NULL, KERN_ERR,
"Cannot specify journal on remount");
- return -1;
+ return -EINVAL;
}
- parsed_opts->journal_devnum = arg;
- } else if (token == Opt_journal_path) {
- char *journal_path;
+ ctx->journal_devnum = result.uint_32;
+ ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
+ return 0;
+ case Opt_journal_path:
+ {
struct inode *journal_inode;
struct path path;
int error;
if (is_remount) {
- ext4_msg(sb, KERN_ERR,
+ ext4_msg(NULL, KERN_ERR,
"Cannot specify journal on remount");
- return -1;
- }
- journal_path = match_strdup(&args[0]);
- if (!journal_path) {
- ext4_msg(sb, KERN_ERR, "error: could not dup "
- "journal device string");
- return -1;
+ return -EINVAL;
}
- error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ error = fs_lookup_param(fc, param, 1, &path);
if (error) {
- ext4_msg(sb, KERN_ERR, "error: could not find "
- "journal device path: error %d", error);
- kfree(journal_path);
- return -1;
+ ext4_msg(NULL, KERN_ERR, "error: could not find "
+ "journal device path");
+ return -EINVAL;
}
journal_inode = d_inode(path.dentry);
- if (!S_ISBLK(journal_inode->i_mode)) {
- ext4_msg(sb, KERN_ERR, "error: journal path %s "
- "is not a block device", journal_path);
- path_put(&path);
- kfree(journal_path);
- return -1;
- }
-
- parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
path_put(&path);
- kfree(journal_path);
- } else if (token == Opt_journal_ioprio) {
- if (arg > 7) {
- ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
+ return 0;
+ }
+ case Opt_journal_ioprio:
+ if (result.uint_32 > 7) {
+ ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority"
" (must be 0-7)");
- return -1;
- }
- parsed_opts->journal_ioprio =
- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
- } else if (token == Opt_test_dummy_encryption) {
- return ext4_set_test_dummy_encryption(sb, opt, &args[0],
- is_remount);
- } else if (m->flags & MOPT_DATAJ) {
- if (is_remount) {
- if (!sbi->s_journal)
- ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
- else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
- ext4_msg(sb, KERN_ERR,
- "Cannot change data mode on remount");
- return -1;
- }
- } else {
- clear_opt(sb, DATA_FLAGS);
- sbi->s_mount_opt |= m->mount_opt;
+ return -EINVAL;
}
-#ifdef CONFIG_QUOTA
- } else if (m->flags & MOPT_QFMT) {
- if (sb_any_quota_loaded(sb) &&
- sbi->s_jquota_fmt != m->mount_opt) {
- ext4_msg(sb, KERN_ERR, "Cannot change journaled "
- "quota options when quota turned on");
- return -1;
+ ctx->journal_ioprio =
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32);
+ ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
+ return 0;
+ case Opt_test_dummy_encryption:
+#ifdef CONFIG_FS_ENCRYPTION
+ if (param->type == fs_value_is_flag) {
+ ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION;
+ ctx->test_dummy_enc_arg = NULL;
+ return 0;
}
- if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_INFO,
- "Quota format mount options ignored "
- "when QUOTA feature is enabled");
- return 1;
+ if (*param->string &&
+ !(!strcmp(param->string, "v1") ||
+ !strcmp(param->string, "v2"))) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Value of option \"%s\" is unrecognized",
+ param->key);
+ return -EINVAL;
}
- sbi->s_jquota_fmt = m->mount_opt;
+ ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION;
+ ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size,
+ GFP_KERNEL);
+#else
+ ext4_msg(NULL, KERN_WARNING,
+ "Test dummy encryption mount option ignored");
#endif
- } else if (token == Opt_dax || token == Opt_dax_always ||
- token == Opt_dax_inode || token == Opt_dax_never) {
+ return 0;
+ case Opt_dax:
+ case Opt_dax_type:
#ifdef CONFIG_FS_DAX
- switch (token) {
+ {
+ int type = (token == Opt_dax) ?
+ Opt_dax : result.uint_32;
+
+ switch (type) {
case Opt_dax:
case Opt_dax_always:
- if (is_remount &&
- (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
- (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
- fail_dax_change_remount:
- ext4_msg(sb, KERN_ERR, "can't change "
- "dax mount option while remounting");
- return -1;
- }
- if (is_remount &&
- (test_opt(sb, DATA_FLAGS) ==
- EXT4_MOUNT_JOURNAL_DATA)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- return -1;
- }
- ext4_msg(sb, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
- sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
break;
case Opt_dax_never:
- if (is_remount &&
- (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
- (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
- goto fail_dax_change_remount;
- sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
- sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
break;
case Opt_dax_inode:
- if (is_remount &&
- ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
- (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
- !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
- goto fail_dax_change_remount;
- sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
- sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
/* Strictly for printing options */
- sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
break;
}
+ return 0;
+ }
#else
- ext4_msg(sb, KERN_INFO, "dax option not supported");
- sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
- sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
- return -1;
+ ext4_msg(NULL, KERN_INFO, "dax option not supported");
+ return -EINVAL;
#endif
- } else if (token == Opt_data_err_abort) {
- sbi->s_mount_opt |= m->mount_opt;
- } else if (token == Opt_data_err_ignore) {
- sbi->s_mount_opt &= ~m->mount_opt;
- } else if (token == Opt_mb_optimize_scan) {
- if (arg != 0 && arg != 1) {
- ext4_msg(sb, KERN_WARNING,
+ case Opt_data_err:
+ if (result.uint_32 == Opt_data_err_abort)
+ ctx_set_mount_opt(ctx, m->mount_opt);
+ else if (result.uint_32 == Opt_data_err_ignore)
+ ctx_clear_mount_opt(ctx, m->mount_opt);
+ return 0;
+ case Opt_mb_optimize_scan:
+ if (result.int_32 == 1) {
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+ } else if (result.int_32 == 0) {
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+ } else {
+ ext4_msg(NULL, KERN_WARNING,
"mb_optimize_scan should be set to 0 or 1.");
- return -1;
+ return -EINVAL;
}
- parsed_opts->mb_optimize_scan = arg;
- } else {
- if (!args->from)
- arg = 1;
+ return 0;
+ }
+
+ /*
+ * At this point we should only be getting options requiring MOPT_SET,
+ * or MOPT_CLEAR. Anything else is a bug
+ */
+ if (m->token == Opt_err) {
+ ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
+ param->key);
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ else {
+ unsigned int set = 0;
+
+ if ((param->type == fs_value_is_flag) ||
+ result.uint_32 > 0)
+ set = 1;
+
if (m->flags & MOPT_CLEAR)
- arg = !arg;
+ set = !set;
else if (unlikely(!(m->flags & MOPT_SET))) {
- ext4_msg(sb, KERN_WARNING,
- "buggy handling of option %s", opt);
+ ext4_msg(NULL, KERN_WARNING,
+ "buggy handling of option %s",
+ param->key);
WARN_ON(1);
- return -1;
+ return -EINVAL;
}
if (m->flags & MOPT_2) {
- if (arg != 0)
- sbi->s_mount_opt2 |= m->mount_opt;
+ if (set != 0)
+ ctx_set_mount_opt2(ctx, m->mount_opt);
else
- sbi->s_mount_opt2 &= ~m->mount_opt;
+ ctx_clear_mount_opt2(ctx, m->mount_opt);
} else {
- if (arg != 0)
- sbi->s_mount_opt |= m->mount_opt;
+ if (set != 0)
+ ctx_set_mount_opt(ctx, m->mount_opt);
else
- sbi->s_mount_opt &= ~m->mount_opt;
+ ctx_clear_mount_opt(ctx, m->mount_opt);
}
}
- return 1;
+
+ return 0;
}
-static int parse_options(char *options, struct super_block *sb,
- struct ext4_parsed_options *ret_opts,
- int is_remount)
+static int parse_options(struct fs_context *fc, char *options)
{
- struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
- char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
- substring_t args[MAX_OPT_ARGS];
- int token;
+ struct fs_parameter param;
+ int ret;
+ char *key;
if (!options)
- return 1;
+ return 0;
- while ((p = strsep(&options, ",")) != NULL) {
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, tokens, args);
- if (handle_mount_opt(sb, p, token, args, ret_opts,
- is_remount) < 0)
- return 0;
+ while ((key = strsep(&options, ",")) != NULL) {
+ if (*key) {
+ size_t v_len = 0;
+ char *value = strchr(key, '=');
+
+ param.type = fs_value_is_flag;
+ param.string = NULL;
+
+ if (value) {
+ if (value == key)
+ continue;
+
+ *value++ = 0;
+ v_len = strlen(value);
+ param.string = kmemdup_nul(value, v_len,
+ GFP_KERNEL);
+ if (!param.string)
+ return -ENOMEM;
+ param.type = fs_value_is_string;
+ }
+
+ param.key = key;
+ param.size = v_len;
+
+ ret = ext4_parse_param(fc, &param);
+ if (param.string)
+ kfree(param.string);
+ if (ret < 0)
+ return ret;
+ }
}
+
+ ret = ext4_validate_options(fc);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static int parse_apply_sb_mount_options(struct super_block *sb,
+ struct ext4_fs_context *m_ctx)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *s_mount_opts = NULL;
+ struct ext4_fs_context *s_ctx = NULL;
+ struct fs_context *fc = NULL;
+ int ret = -ENOMEM;
+
+ if (!sbi->s_es->s_mount_opts[0])
+ return 0;
+
+ s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
+ sizeof(sbi->s_es->s_mount_opts),
+ GFP_KERNEL);
+ if (!s_mount_opts)
+ return ret;
+
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ if (!fc)
+ goto out_free;
+
+ s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
+ if (!s_ctx)
+ goto out_free;
+
+ fc->fs_private = s_ctx;
+ fc->s_fs_info = sbi;
+
+ ret = parse_options(fc, s_mount_opts);
+ if (ret < 0)
+ goto parse_failed;
+
+ ret = ext4_check_opt_consistency(fc, sb);
+ if (ret < 0) {
+parse_failed:
+ ext4_msg(sb, KERN_WARNING,
+ "failed to parse options in superblock: %s",
+ s_mount_opts);
+ ret = 0;
+ goto out_free;
+ }
+
+ if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
+ m_ctx->journal_devnum = s_ctx->journal_devnum;
+ if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
+ m_ctx->journal_ioprio = s_ctx->journal_ioprio;
+
+ ret = ext4_apply_options(fc, sb);
+
+out_free:
+ kfree(s_ctx);
+ kfree(fc);
+ kfree(s_mount_opts);
+ return ret;
+}
+
+static void ext4_apply_quota_options(struct fs_context *fc,
+ struct super_block *sb)
+{
#ifdef CONFIG_QUOTA
+ bool quota_feature = ext4_has_feature_quota(sb);
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *qname;
+ int i;
+
+ if (quota_feature)
+ return;
+
+ if (ctx->spec & EXT4_SPEC_JQUOTA) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (!(ctx->qname_spec & (1 << i)))
+ continue;
+
+ qname = ctx->s_qf_names[i]; /* May be NULL */
+ if (qname)
+ set_opt(sb, QUOTA);
+ ctx->s_qf_names[i] = NULL;
+ qname = rcu_replace_pointer(sbi->s_qf_names[i], qname,
+ lockdep_is_held(&sb->s_umount));
+ if (qname)
+ kfree_rcu(qname);
+ }
+ }
+
+ if (ctx->spec & EXT4_SPEC_JQFMT)
+ sbi->s_jquota_fmt = ctx->s_jquota_fmt;
+#endif
+}
+
+/*
+ * Check quota settings consistency.
+ */
+static int ext4_check_quota_consistency(struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ bool quota_feature = ext4_has_feature_quota(sb);
+ bool quota_loaded = sb_any_quota_loaded(sb);
+ bool usr_qf_name, grp_qf_name, usrquota, grpquota;
+ int quota_flags, i;
+
/*
* We do the test below only for project quotas. 'usrquota' and
* 'grpquota' mount options are allowed even without quota feature
* to support legacy quotas in quota files.
*/
- if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
- ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) &&
+ !ext4_has_feature_project(sb)) {
+ ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. "
"Cannot enable project quota enforcement.");
- return 0;
+ return -EINVAL;
}
- usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
- grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
- if (usr_qf_name || grp_qf_name) {
- if (test_opt(sb, USRQUOTA) && usr_qf_name)
- clear_opt(sb, USRQUOTA);
- if (test_opt(sb, GRPQUOTA) && grp_qf_name)
- clear_opt(sb, GRPQUOTA);
+ quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+ EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA;
+ if (quota_loaded &&
+ ctx->mask_s_mount_opt & quota_flags &&
+ !ctx_test_mount_opt(ctx, quota_flags))
+ goto err_quota_change;
+
+ if (ctx->spec & EXT4_SPEC_JQUOTA) {
+
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (!(ctx->qname_spec & (1 << i)))
+ continue;
+
+ if (quota_loaded &&
+ !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i])
+ goto err_jquota_change;
+
+ if (sbi->s_qf_names[i] && ctx->s_qf_names[i] &&
+ strcmp(get_qf_name(sb, sbi, i),
+ ctx->s_qf_names[i]) != 0)
+ goto err_jquota_specified;
+ }
- if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
- ext4_msg(sb, KERN_ERR, "old and new quota "
- "format mixing");
+ if (quota_feature) {
+ ext4_msg(NULL, KERN_INFO,
+ "Journaled quota options ignored when "
+ "QUOTA feature is enabled");
return 0;
}
+ }
- if (!sbi->s_jquota_fmt) {
- ext4_msg(sb, KERN_ERR, "journaled quota format "
- "not specified");
+ if (ctx->spec & EXT4_SPEC_JQFMT) {
+ if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded)
+ goto err_jquota_change;
+ if (quota_feature) {
+ ext4_msg(NULL, KERN_INFO, "Quota format mount options "
+ "ignored when QUOTA feature is enabled");
return 0;
}
}
+
+ /* Make sure we don't mix old and new quota format */
+ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) ||
+ ctx->s_qf_names[USRQUOTA]);
+ grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) ||
+ ctx->s_qf_names[GRPQUOTA]);
+
+ usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
+ test_opt(sb, USRQUOTA));
+
+ grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) ||
+ test_opt(sb, GRPQUOTA));
+
+ if (usr_qf_name) {
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
+ usrquota = false;
+ }
+ if (grp_qf_name) {
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
+ grpquota = false;
+ }
+
+ if (usr_qf_name || grp_qf_name) {
+ if (usrquota || grpquota) {
+ ext4_msg(NULL, KERN_ERR, "old and new quota "
+ "format mixing");
+ return -EINVAL;
+ }
+
+ if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) {
+ ext4_msg(NULL, KERN_ERR, "journaled quota format "
+ "not specified");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+
+err_quota_change:
+ ext4_msg(NULL, KERN_ERR,
+ "Cannot change quota options when quota turned on");
+ return -EINVAL;
+err_jquota_change:
+ ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota "
+ "options when quota turned on");
+ return -EINVAL;
+err_jquota_specified:
+ ext4_msg(NULL, KERN_ERR, "%s quota file already specified",
+ QTYPE2NAME(i));
+ return -EINVAL;
+#else
+ return 0;
#endif
- if (test_opt(sb, DIOREAD_NOLOCK)) {
+}
+
+/*
+ * ext4_check_opt_consistency() - sanity-check parsed mount options before
+ * they are applied.
+ *
+ * @fc: filesystem context; fc->fs_private holds the parsed options and
+ *      fc->s_fs_info the ext4_sb_info they are checked against
+ * @sb: superblock being mounted or reconfigured
+ *
+ * Rejects options flagged as incompatible with ext2/ext3 superblocks, a
+ * want_extra_isize that does not fit in the inode, an attempt to set
+ * test_dummy_encryption on remount, and data-mode or dax changes on
+ * remount.  A dioread_nolock mount with blocksize < PAGE_SIZE only gets
+ * a warning.
+ *
+ * Returns -EINVAL for every rejection made here, otherwise whatever
+ * ext4_check_quota_consistency() returns.
+ */
+static int ext4_check_opt_consistency(struct fs_context *fc,
+				      struct super_block *sb)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct ext4_sb_info *sbi = fc->s_fs_info;
+	int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
+
+	if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+		ext4_msg(NULL, KERN_ERR,
+			 "Mount option(s) incompatible with ext2");
+		return -EINVAL;
+	}
+	if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+		ext4_msg(NULL, KERN_ERR,
+			 "Mount option(s) incompatible with ext3");
+		return -EINVAL;
+	}
+
+	if (ctx->s_want_extra_isize >
+	    (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
+		ext4_msg(NULL, KERN_ERR,
+			 "Invalid want_extra_isize %d",
+			 ctx->s_want_extra_isize);
+		return -EINVAL;
+	}
+
+	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) {
 		int blocksize =
 			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 		if (blocksize < PAGE_SIZE)
-			ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
+			ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an "
 				 "experimental mount option 'dioread_nolock' "
 				 "for blocksize < PAGE_SIZE");
 	}
+
+#ifdef CONFIG_FS_ENCRYPTION
+	/*
+	 * This mount option is just for testing, and it's not worthwhile to
+	 * implement the extra complexity (e.g. RCU protection) that would be
+	 * needed to allow it to be set or changed during remount.  We do allow
+	 * it to be specified during remount, but only if there is no change.
+	 */
+	if ((ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) &&
+	    is_remount && !sbi->s_dummy_enc_policy.policy) {
+		ext4_msg(NULL, KERN_WARNING,
+			 "Can't set test_dummy_encryption on remount");
+		/* a bare -1 would reach the caller as -EPERM */
+		return -EINVAL;
+	}
+#endif
+
+	if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
+		if (!sbi->s_journal) {
+			ext4_msg(NULL, KERN_WARNING,
+				 "Remounting file system with no journal "
+				 "so ignoring journalled data option");
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+		} else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
+			   test_opt(sb, DATA_FLAGS)) {
+			ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
+				 "on remount");
+			return -EINVAL;
+		}
+	}
+
+	if (is_remount) {
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+		    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
+			ext4_msg(NULL, KERN_ERR, "can't mount with "
+				 "both data=journal and dax");
+			return -EINVAL;
+		}
+
+		/*
+		 * The dax mode may be restated on remount but not changed:
+		 * each branch fails when the requested mode differs from
+		 * the one currently recorded in sbi.
+		 */
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+		    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+		     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+fail_dax_change_remount:
+			ext4_msg(NULL, KERN_ERR, "can't change "
+				 "dax mount option while remounting");
+			return -EINVAL;
+		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
+			 (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+			  (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
+			goto fail_dax_change_remount;
+		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
+			   ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+			    (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+			    !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
+			goto fail_dax_change_remount;
+		}
+	}
+
+	return ext4_check_quota_consistency(fc, sb);
+}
+
+/*
+ * Transfer the option state collected in the fs_context into the
+ * superblock and its ext4_sb_info.
+ *
+ * For each flag word the bits named in the context's mask are dropped and
+ * the context's values are OR-ed in.  Scalar settings are copied only when
+ * their EXT4_SPEC_* bit is present in ctx->spec.
+ *
+ * Returns 0, or the result of ext4_set_test_dummy_encryption() when the
+ * dummy-encryption option was specified.
+ */
+static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = fc->s_fs_info;
+	struct ext4_fs_context *ctx = fc->fs_private;
+
+	sbi->s_mount_opt = (sbi->s_mount_opt & ~ctx->mask_s_mount_opt) |
+			   ctx->vals_s_mount_opt;
+	sbi->s_mount_opt2 = (sbi->s_mount_opt2 & ~ctx->mask_s_mount_opt2) |
+			    ctx->vals_s_mount_opt2;
+	sbi->s_mount_flags = (sbi->s_mount_flags & ~ctx->mask_s_mount_flags) |
+			     ctx->vals_s_mount_flags;
+	sb->s_flags = (sb->s_flags & ~ctx->mask_s_flags) | ctx->vals_s_flags;
+
+	/*
+	 * i_version is not the same thing as the generic iversion mount
+	 * option, so the VFS must be told explicitly that it was touched;
+	 * otherwise it would be cleared on remount.
+	 */
+	if (ctx->mask_s_flags & SB_I_VERSION)
+		fc->sb_flags |= SB_I_VERSION;
+
+#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
+	APPLY(s_commit_interval);
+	APPLY(s_stripe);
+	APPLY(s_max_batch_time);
+	APPLY(s_min_batch_time);
+	APPLY(s_want_extra_isize);
+	APPLY(s_inode_readahead_blks);
+	APPLY(s_max_dir_size_kb);
+	APPLY(s_li_wait_mult);
+	APPLY(s_resgid);
+	APPLY(s_resuid);
+
+#ifdef CONFIG_EXT4_DEBUG
+	APPLY(s_fc_debug_max_replay);
+#endif
+
+	ext4_apply_quota_options(fc, sb);
+
+	if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION))
+		return 0;
+
+	return ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg);
+}
+
+
+/*
+ * Check the parsed options for old/new quota format mixing.
+ *
+ * When a journaled quota file name was given for a quota type, a
+ * usrquota/grpquota flag requested for that same type is cleared.  If a
+ * quota file name was given and either flag is still set afterwards, the
+ * combination is rejected with -EINVAL.  Returns 1 otherwise.
+ */
+static int ext4_validate_options(struct fs_context *fc)
+{
+#ifdef CONFIG_QUOTA
+	struct ext4_fs_context *ctx = fc->fs_private;
+	char *usr_file = ctx->s_qf_names[USRQUOTA];
+	char *grp_file = ctx->s_qf_names[GRPQUOTA];
+
+	if (usr_file && ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA))
+		ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
+
+	if (grp_file && ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA))
+		ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
+
+	if ((usr_file || grp_file) &&
+	    (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
+	     ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA))) {
+		ext4_msg(NULL, KERN_ERR, "old and new quota "
+			 "format mixing");
+		return -EINVAL;
+	}
+#endif
 	return 1;
 }
@@ -2526,12 +2992,12 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
static const char *token2str(int token)
{
- const struct match_token *t;
+ const struct fs_parameter_spec *spec;
- for (t = tokens; t->token != Opt_err; t++)
- if (t->token == token && !strchr(t->pattern, '='))
+ for (spec = ext4_param_specs; spec->name != NULL; spec++)
+ if (spec->opt == token && !spec->type)
break;
- return t->pattern;
+ return spec->name;
}
/*
@@ -2557,7 +3023,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
- (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
+ m->flags & MOPT_SKIP)
continue;
if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */
@@ -2705,8 +3171,6 @@ done:
EXT4_BLOCKS_PER_GROUP(sb),
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt, sbi->s_mount_opt2);
-
- cleancache_init_fs(sb);
return err;
}
@@ -3027,8 +3491,9 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
*/
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
- unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS;
+ loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
int meta_blocks;
+ unsigned int ppb = 1 << (bits - 2);
/*
* This is calculated to be the largest file size for a dense, block
@@ -3060,27 +3525,42 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
}
+ /* Compute how many blocks we can address by block tree */
+ res += ppb;
+ res += ppb * ppb;
+ res += ((loff_t)ppb) * ppb * ppb;
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
+ /* Does block tree limit file size? */
+ if (res + meta_blocks <= upper_limit)
+ goto check_lfs;
+
+ res = upper_limit;
+ /* How many metadata blocks are needed for addressing upper_limit? */
+ upper_limit -= EXT4_NDIR_BLOCKS;
/* indirect blocks */
meta_blocks = 1;
+ upper_limit -= ppb;
/* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
+ if (upper_limit < ppb * ppb) {
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
+ res -= meta_blocks;
+ goto check_lfs;
+ }
+ meta_blocks += 1 + ppb;
+ upper_limit -= ppb * ppb;
+ /* triple indirect blocks for the rest */
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
+ DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
+ res -= meta_blocks;
+check_lfs:
res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
- return (loff_t)res;
+ return res;
}
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
@@ -3165,7 +3645,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#ifndef CONFIG_UNICODE
+#if !IS_ENABLED(CONFIG_UNICODE)
if (ext4_has_feature_casefold(sb)) {
ext4_msg(sb, KERN_ERR,
"Filesystem with casefold feature cannot be "
@@ -3263,9 +3743,9 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
struct super_block *sb = elr->lr_super;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t group = elr->lr_next_group;
- unsigned long timeout = 0;
unsigned int prefetch_ios = 0;
int ret = 0;
+ u64 start_time;
if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
elr->lr_next_group = ext4_mb_prefetch(sb, group,
@@ -3302,14 +3782,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- EXT4_SB(elr->lr_super)->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
@@ -3698,9 +4177,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_fsblk_t first_block, last_block, b;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
+ int has_super = ext4_bg_has_super(sb, grp);
if (!ext4_has_feature_bigalloc(sb))
- return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ return (has_super + ext4_bg_num_gdb(sb, grp) +
+ (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
sbi->s_itb_per_group + 2);
first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
@@ -3870,21 +4351,52 @@ static void ext4_setup_csum_trigger(struct super_block *sb,
sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
}
-static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+/*
+ * Release an ext4_sb_info together with its blockgroup lock and its DAX
+ * device reference.  A NULL @sbi is accepted and ignored.
+ */
+static void ext4_free_sbi(struct ext4_sb_info *sbi)
+{
+	if (sbi) {
+		kfree(sbi->s_blockgroup_lock);
+		fs_put_dax(sbi->s_daxdev);
+		kfree(sbi);
+	}
+}
+
+/*
+ * Allocate a zeroed ext4_sb_info for @sb, look up the DAX device of the
+ * backing block device and allocate the blockgroup lock.
+ *
+ * On success the new structure is linked both ways (sb->s_fs_info and
+ * sbi->s_sb) and returned.  On allocation failure everything acquired so
+ * far is released and NULL is returned.
+ */
+static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
+{
+	struct ext4_sb_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+
+	info->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &info->s_dax_part_off);
+
+	info->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (info->s_blockgroup_lock) {
+		sb->s_fs_info = info;
+		info->s_sb = sb;
+		return info;
+	}
+
+	/* lock allocation failed: undo the DAX lookup and the sbi itself */
+	fs_put_dax(info->s_daxdev);
+	kfree(info);
+	return NULL;
+}
+
+static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
{
- struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
- char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh, **group_desc;
struct ext4_super_block *es = NULL;
- struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct flex_groups **flex_groups;
ext4_fsblk_t block;
- ext4_fsblk_t sb_block = get_sb_block(&data);
ext4_fsblk_t logical_sb_block;
unsigned long offset = 0;
unsigned long def_mount_opts;
struct inode *root;
- const char *descr;
int ret = -ENOMEM;
int blocksize, clustersize;
unsigned int db_count;
@@ -3893,32 +4405,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__u64 blocks_count;
int err = 0;
ext4_group_t first_not_zeroed;
- struct ext4_parsed_options parsed_opts;
+ struct ext4_fs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
/* Set defaults for the variables that will be set during parsing */
- parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- parsed_opts.journal_devnum = 0;
- parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
-
- if ((data && !orig_data) || !sbi)
- goto out_free_base;
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- sbi->s_daxdev = dax_dev;
- sbi->s_blockgroup_lock =
- kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
- if (!sbi->s_blockgroup_lock)
- goto out_free_base;
-
- sb->s_fs_info = sbi;
- sbi->s_sb = sb;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
- sbi->s_sb_block = sb_block;
sbi->s_sectors_written_start =
part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
- /* Cleanup superblock name */
- strreplace(sb->s_id, '/', '!');
-
/* -EINVAL is default */
ret = -EINVAL;
blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
@@ -3932,10 +4428,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* block sizes. We need to calculate the offset from buffer start.
*/
if (blocksize != EXT4_MIN_BLOCK_SIZE) {
- logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
} else {
- logical_sb_block = sb_block;
+ logical_sb_block = sbi->s_sb_block;
}
bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
@@ -4140,31 +4636,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- if (sbi->s_es->s_mount_opts[0]) {
- char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
- sizeof(sbi->s_es->s_mount_opts),
- GFP_KERNEL);
- if (!s_mount_opts)
- goto failed_mount;
- if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) {
- ext4_msg(sb, KERN_WARNING,
- "failed to parse options in superblock: %s",
- s_mount_opts);
- }
- kfree(s_mount_opts);
- }
+ err = parse_apply_sb_mount_options(sb, ctx);
+ if (err < 0)
+ goto failed_mount;
+
sbi->s_def_mount_opt = sbi->s_mount_opt;
- if (!parse_options((char *) data, sb, &parsed_opts, 0))
+
+ err = ext4_check_opt_consistency(fc, sb);
+ if (err < 0)
+ goto failed_mount;
+
+ err = ext4_apply_options(fc, sb);
+ if (err < 0)
goto failed_mount;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
const struct ext4_sb_encodings *encoding_info;
struct unicode_map *encoding;
- __u16 encoding_flags;
+ __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
- if (ext4_sb_read_encoding(es, &encoding_info,
- &encoding_flags)) {
+ encoding_info = ext4_sb_read_encoding(es);
+ if (!encoding_info) {
ext4_msg(sb, KERN_ERR,
"Encoding requested by superblock is unknown");
goto failed_mount;
@@ -4173,15 +4666,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
encoding = utf8_load(encoding_info->version);
if (IS_ERR(encoding)) {
ext4_msg(sb, KERN_ERR,
- "can't mount with superblock charset: %s-%s "
+ "can't mount with superblock charset: %s-%u.%u.%u "
"not supported by the kernel. flags: 0x%x.",
- encoding_info->name, encoding_info->version,
+ encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
encoding_flags);
goto failed_mount;
}
ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
- "%s-%s with flags 0x%hx", encoding_info->name,
- encoding_info->version?:"\b", encoding_flags);
+ "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
sb->s_encoding = encoding;
sb->s_encoding_flags = encoding_flags;
@@ -4293,9 +4792,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0,
- bdev_nr_sectors(sb->s_bdev)))
- set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+ if (sbi->s_daxdev) {
+ if (blocksize == PAGE_SIZE)
+ set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+ else
+ ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
+ }
if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
if (ext4_has_feature_inline_data(sb)) {
@@ -4331,7 +4833,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
if (IS_ERR(bh)) {
@@ -4474,7 +4976,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto cantfind_ext4;
/* check blocks count against device size */
- blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+ blocks_count = sb_bdev_nr_blocks(sb);
if (blocks_count && ext4_blocks_count(es) > blocks_count) {
ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
"exceeds size of device (%llu blocks)",
@@ -4614,14 +5116,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Initialize fast commit stuff */
atomic_set(&sbi->s_fc_subtid, 0);
- atomic_set(&sbi->s_fc_ineligible_updates, 0);
INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
sbi->s_fc_bytes = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+ sbi->s_fc_ineligible_tid = 0;
spin_lock_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
@@ -4647,7 +5148,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* root first: it may be modified in the journal!
*/
if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
- err = ext4_load_journal(sb, es, parsed_opts.journal_devnum);
+ err = ext4_load_journal(sb, es, ctx->journal_devnum);
if (err)
goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
@@ -4747,7 +5248,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount_wq;
}
- set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio);
+ set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
sbi->s_journal->j_submit_inode_data_buffers =
ext4_journal_submit_inode_data_buffers;
@@ -4788,9 +5289,18 @@ no_journal:
* Get the # of file system overhead blocks from the
* superblock if present.
*/
- if (es->s_overhead_clusters)
- sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
- else {
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ /* ignore the precalculated value if it is ridiculous */
+ if (sbi->s_overhead > ext4_blocks_count(es))
+ sbi->s_overhead = 0;
+ /*
+ * If the bigalloc feature is not enabled recalculating the
+ * overhead doesn't take long, so we might as well just redo
+ * it to make sure we are using the correct value.
+ */
+ if (!ext4_has_feature_bigalloc(sb))
+ sbi->s_overhead = 0;
+ if (sbi->s_overhead == 0) {
err = ext4_calculate_overhead(sb);
if (err)
goto failed_mount_wq;
@@ -4859,12 +5369,12 @@ no_journal:
* turned off by passing "mb_optimize_scan=0". This can also be
* turned on forcefully by passing "mb_optimize_scan=1".
*/
- if (parsed_opts.mb_optimize_scan == 1)
- set_opt2(sb, MB_OPTIMIZE_SCAN);
- else if (parsed_opts.mb_optimize_scan == 0)
- clear_opt2(sb, MB_OPTIMIZE_SCAN);
- else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
- set_opt2(sb, MB_OPTIMIZE_SCAN);
+ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
+ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
+ set_opt2(sb, MB_OPTIMIZE_SCAN);
+ else
+ clear_opt2(sb, MB_OPTIMIZE_SCAN);
+ }
err = ext4_mb_init(sb);
if (err) {
@@ -4963,15 +5473,6 @@ no_journal:
if (err)
goto failed_mount9;
}
- if (EXT4_SB(sb)->s_journal) {
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- descr = " journalled data mode";
- else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- descr = " ordered data mode";
- else
- descr = " writeback data mode";
- } else
- descr = "out journal";
if (test_opt(sb, DISCARD)) {
struct request_queue *q = bdev_get_queue(sb->s_bdev);
@@ -4981,14 +5482,6 @@ no_journal:
"the device does not support discard");
}
- if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
- ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
- "Opts: %.*s%s%s. Quota mode: %s.", descr,
- (int) sizeof(sbi->s_es->s_mount_opts),
- sbi->s_es->s_mount_opts,
- *sbi->s_es->s_mount_opts ? "; " : "", orig_data,
- ext4_quota_mode(sb));
-
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -4999,7 +5492,6 @@ no_journal:
atomic_set(&sbi->s_warning_count, 0);
atomic_set(&sbi->s_msg_count, 0);
- kfree(orig_data);
return 0;
cantfind_ext4:
@@ -5071,7 +5563,7 @@ failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
#endif
@@ -5085,14 +5577,62 @@ failed_mount:
ext4_blkdev_remove(sbi);
out_fail:
sb->s_fs_info = NULL;
- kfree(sbi->s_blockgroup_lock);
-out_free_base:
- kfree(sbi);
- kfree(orig_data);
- fs_put_dax(dax_dev);
return err ? err : ret;
}
+/*
+ * Fill callback handed to get_tree_bdev(): allocate the ext4_sb_info, pick
+ * the superblock location, run __ext4_fill_super() and log the mount.
+ *
+ * Returns 0 on success, -ENOMEM if the sb info cannot be allocated, or the
+ * negative value returned by __ext4_fill_super(); in the latter case the
+ * sb info is freed again and fc->s_fs_info is reset to NULL.
+ */
+static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct ext4_sb_info *sbi = ext4_alloc_sbi(sb);
+	const char *descr = "out journal";
+	int ret;
+
+	if (!sbi)
+		return -ENOMEM;
+
+	fc->s_fs_info = sbi;
+
+	/* Cleanup superblock name */
+	strreplace(sb->s_id, '/', '!');
+
+	/* Block 1 is the default location unless the options gave another */
+	sbi->s_sb_block = (ctx->spec & EXT4_SPEC_s_sb_block) ?
+			  ctx->s_sb_block : 1;
+
+	ret = __ext4_fill_super(fc, sb);
+	if (ret < 0) {
+		ext4_free_sbi(sbi);
+		fc->s_fs_info = NULL;
+		return ret;
+	}
+
+	if (sbi->s_journal) {
+		switch (test_opt(sb, DATA_FLAGS)) {
+		case EXT4_MOUNT_JOURNAL_DATA:
+			descr = " journalled data mode";
+			break;
+		case EXT4_MOUNT_ORDERED_DATA:
+			descr = " ordered data mode";
+			break;
+		default:
+			descr = " writeback data mode";
+			break;
+		}
+	}
+
+	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
+		ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+			 "Quota mode: %s.", descr, ext4_quota_mode(sb));
+
+	/* Update the s_overhead_clusters if necessary */
+	ext4_update_overhead(sb);
+	return 0;
+}
+
+/*
+ * Obtain the superblock tree for @fc through get_tree_bdev(), using
+ * ext4_fill_super() as the fill callback.  Returns get_tree_bdev()'s result.
+ */
+static int ext4_get_tree(struct fs_context *fc)
+{
+	int err = get_tree_bdev(fc, ext4_fill_super);
+
+	return err;
+}
+
/*
* Setup any per-fs journal parameters now. We'll do this both on
* initial mount, once the journal has been initialised but before we've
@@ -5721,27 +6261,22 @@ struct ext4_mount_options {
#endif
};
-static int ext4_remount(struct super_block *sb, int *flags, char *data)
+static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
{
+ struct ext4_fs_context *ctx = fc->fs_private;
struct ext4_super_block *es;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned long old_sb_flags, vfs_flags;
+ unsigned long old_sb_flags;
struct ext4_mount_options old_opts;
- int enable_quota = 0;
ext4_group_t g;
int err = 0;
#ifdef CONFIG_QUOTA
+ int enable_quota = 0;
int i, j;
char *to_free[EXT4_MAXQUOTAS];
#endif
- char *orig_data = kstrdup(data, GFP_KERNEL);
- struct ext4_parsed_options parsed_opts;
- parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- parsed_opts.journal_devnum = 0;
-
- if (data && !orig_data)
- return -ENOMEM;
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
/* Store the original options */
old_sb_flags = sb->s_flags;
@@ -5762,28 +6297,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!old_opts.s_qf_names[i]) {
for (j = 0; j < i; j++)
kfree(old_opts.s_qf_names[j]);
- kfree(orig_data);
return -ENOMEM;
}
} else
old_opts.s_qf_names[i] = NULL;
#endif
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
- parsed_opts.journal_ioprio =
+ ctx->journal_ioprio =
sbi->s_journal->j_task->io_context->ioprio;
- /*
- * Some options can be enabled by ext4 and/or by VFS mount flag
- * either way we need to make sure it matches in both *flags and
- * s_flags. Copy those selected flags from *flags to s_flags
- */
- vfs_flags = SB_LAZYTIME | SB_I_VERSION;
- sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
-
- if (!parse_options(data, sb, &parsed_opts, 1)) {
- err = -EINVAL;
- goto restore_opts;
- }
+ ext4_apply_options(fc, sb);
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
@@ -5821,7 +6344,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
- ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
+ ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -5830,19 +6353,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal) {
ext4_init_journal_params(sb, sbi->s_journal);
- set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio);
+ set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
}
/* Flush outstanding errors before changing fs state */
flush_work(&sbi->s_error_work);
- if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
+ if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
err = -EROFS;
goto restore_opts;
}
- if (*flags & SB_RDONLY) {
+ if (fc->sb_flags & SB_RDONLY) {
err = sync_filesystem(sb);
if (err < 0)
goto restore_opts;
@@ -5935,7 +6458,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EROFS;
goto restore_opts;
}
+#ifdef CONFIG_QUOTA
enable_quota = 1;
+#endif
}
}
@@ -5988,16 +6513,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
ext4_stop_mmpd(sbi);
- /*
- * Some options can be enabled by ext4 and/or by VFS mount flag
- * either way we need to make sure it matches in both *flags and
- * s_flags. Copy those selected flags from s_flags to *flags
- */
- *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);
-
- ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. Quota mode: %s.",
- orig_data, ext4_quota_mode(sb));
- kfree(orig_data);
return 0;
restore_opts:
@@ -6023,10 +6538,30 @@ restore_opts:
#endif
if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
ext4_stop_mmpd(sbi);
- kfree(orig_data);
return err;
}
+/*
+ * Reconfigure (remount) entry point: point fc->s_fs_info at the live
+ * ext4_sb_info, check the new options for consistency and, if that did not
+ * fail, run __ext4_remount().
+ *
+ * Returns the first negative result of either step, or 0 after logging the
+ * re-mount message.
+ */
+static int ext4_reconfigure(struct fs_context *fc)
+{
+	struct super_block *sb = fc->root->d_sb;
+	int err;
+
+	fc->s_fs_info = EXT4_SB(sb);
+
+	err = ext4_check_opt_consistency(fc, sb);
+	if (err >= 0)
+		err = __ext4_remount(fc, sb);
+	if (err < 0)
+		return err;
+
+	ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.",
+		 ext4_quota_mode(sb));
+
+	return 0;
+}
+
#ifdef CONFIG_QUOTA
static int ext4_statfs_project(struct super_block *sb,
kprojid_t projid, struct kstatfs *buf)
@@ -6267,10 +6802,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
err = dquot_quota_on(sb, type, format_id, path);
- if (err) {
- lockdep_set_quota_inode(path->dentry->d_inode,
- I_DATA_SEM_NORMAL);
- } else {
+ if (!err) {
struct inode *inode = d_inode(path->dentry);
handle_t *handle;
@@ -6290,7 +6822,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
ext4_journal_stop(handle);
unlock_inode:
inode_unlock(inode);
+ if (err)
+ dquot_quota_off(sb, type);
}
+ if (err)
+ lockdep_set_quota_inode(path->dentry->d_inode,
+ I_DATA_SEM_NORMAL);
return err;
}
@@ -6353,8 +6890,19 @@ int ext4_enable_quotas(struct super_block *sb)
"Failed to enable quota tracking "
"(type=%d, err=%d). Please run "
"e2fsck to fix.", type, err);
- for (type--; type >= 0; type--)
+ for (type--; type >= 0; type--) {
+ struct inode *inode;
+
+ inode = sb_dqopt(sb)->files[type];
+ if (inode)
+ inode = igrab(inode);
dquot_quota_off(sb, type);
+ if (inode) {
+ lockdep_set_quota_inode(inode,
+ I_DATA_SEM_NORMAL);
+ iput(inode);
+ }
+ }
return err;
}
@@ -6458,7 +7006,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
- if (EXT4_SB(sb)->s_journal && !handle) {
+ if (!handle) {
ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
" cancelled because transaction is not started",
(unsigned long long)off, (unsigned long long)len);
@@ -6509,12 +7057,6 @@ out:
}
#endif
-static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
-}
-
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static inline void register_as_ext2(void)
{
@@ -6572,11 +7114,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
}
static struct file_system_type ext4_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext4",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .owner = THIS_MODULE,
+ .name = "ext4",
+ .init_fs_context = ext4_init_fs_context,
+ .parameters = ext4_param_specs,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");
@@ -6641,6 +7184,7 @@ static int __init ext4_init_fs(void)
out:
unregister_as_ext2();
unregister_as_ext3();
+ ext4_fc_destroy_dentry_cache();
out05:
destroy_inodecache();
out1:
@@ -6667,6 +7211,7 @@ static void __exit ext4_exit_fs(void)
unregister_as_ext2();
unregister_as_ext3();
unregister_filesystem(&ext4_fs_type);
+ ext4_fc_destroy_dentry_cache();
destroy_inodecache();
ext4_exit_mballoc();
ext4_exit_sysfs();
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 2314f7446592..d233c24ea342 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -63,7 +63,7 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
- return snprintf(buf, PAGE_SIZE, "%lu\n",
+ return sysfs_emit(buf, "%lu\n",
(part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
sbi->s_sectors_written_start) >> 1);
}
@@ -72,7 +72,7 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(sbi->s_kbytes_written +
((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
@@ -130,8 +130,8 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
{
if (!sbi->s_journal)
- return snprintf(buf, PAGE_SIZE, "<none>\n");
- return snprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "<none>\n");
+ return sysfs_emit(buf, "%d\n",
task_pid_vnr(sbi->s_journal->j_task));
}
@@ -245,6 +245,7 @@ EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
+EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -295,6 +296,7 @@ static struct attribute *ext4_attrs[] = {
#endif
ATTR_LIST(mb_prefetch),
ATTR_LIST(mb_prefetch_limit),
+ ATTR_LIST(last_trim_minblks),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -307,7 +309,7 @@ EXT4_ATTR_FEATURE(meta_bg_resize);
EXT4_ATTR_FEATURE(encryption);
EXT4_ATTR_FEATURE(test_dummy_encryption_v2);
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
EXT4_ATTR_FEATURE(casefold);
#endif
#ifdef CONFIG_FS_VERITY
@@ -315,7 +317,7 @@ EXT4_ATTR_FEATURE(verity);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
EXT4_ATTR_FEATURE(fast_commit);
-#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
EXT4_ATTR_FEATURE(encrypted_casefold);
#endif
@@ -327,7 +329,7 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(encryption),
ATTR_LIST(test_dummy_encryption_v2),
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
ATTR_LIST(casefold),
#endif
#ifdef CONFIG_FS_VERITY
@@ -335,7 +337,7 @@ static struct attribute *ext4_feat_attrs[] = {
#endif
ATTR_LIST(metadata_csum_seed),
ATTR_LIST(fast_commit),
-#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
ATTR_LIST(encrypted_casefold),
#endif
NULL,
@@ -357,7 +359,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
{
- return snprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
((time64_t)hi << 32) + le32_to_cpu(lo));
}
@@ -374,7 +376,7 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
switch (a->attr_id) {
case attr_delayed_allocation_blocks:
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(s64) EXT4_C2B(sbi,
percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
case attr_session_write_kbytes:
@@ -382,11 +384,11 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
case attr_lifetime_write_kbytes:
return lifetime_write_kbytes_show(sbi, buf);
case attr_reserved_clusters:
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)
atomic64_read(&sbi->s_resv_clusters));
case attr_sra_exceeded_retry_limit:
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)
percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
case attr_inode_readahead:
@@ -394,42 +396,42 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
if (!ptr)
return 0;
if (a->attr_ptr == ptr_ext4_super_block_offset)
- return snprintf(buf, PAGE_SIZE, "%u\n",
+ return sysfs_emit(buf, "%u\n",
le32_to_cpup(ptr));
else
- return snprintf(buf, PAGE_SIZE, "%u\n",
+ return sysfs_emit(buf, "%u\n",
*((unsigned int *) ptr));
case attr_pointer_ul:
if (!ptr)
return 0;
- return snprintf(buf, PAGE_SIZE, "%lu\n",
+ return sysfs_emit(buf, "%lu\n",
*((unsigned long *) ptr));
case attr_pointer_u8:
if (!ptr)
return 0;
- return snprintf(buf, PAGE_SIZE, "%u\n",
+ return sysfs_emit(buf, "%u\n",
*((unsigned char *) ptr));
case attr_pointer_u64:
if (!ptr)
return 0;
if (a->attr_ptr == ptr_ext4_super_block_offset)
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
le64_to_cpup(ptr));
else
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
*((unsigned long long *) ptr));
case attr_pointer_string:
if (!ptr)
return 0;
- return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size,
+ return sysfs_emit(buf, "%.*s\n", a->attr_size,
(char *) ptr);
case attr_pointer_atomic:
if (!ptr)
return 0;
- return snprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "%d\n",
atomic_read((atomic_t *) ptr));
case attr_feature:
- return snprintf(buf, PAGE_SIZE, "supported\n");
+ return sysfs_emit(buf, "supported\n");
case attr_first_error_time:
return print_tstamp(buf, sbi->s_es, s_first_error_time);
case attr_last_error_time:
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1e0fc1ed845b..042325349098 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2408,7 +2408,7 @@ retry_inode:
if (IS_SYNC(inode))
ext4_handle_sync(handle);
}
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
cleanup:
brelse(is.iloc.bh);
@@ -2486,7 +2486,7 @@ retry:
if (error == 0)
error = error2;
}
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL);
return error;
}
@@ -2920,7 +2920,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
error);
goto cleanup;
}
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
}
error = 0;
cleanup:
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 7eea3cfd894d..03ef087537c7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -7,6 +7,7 @@ config F2FS_FS
select CRYPTO_CRC32
select F2FS_FS_XATTR if FS_ENCRYPTION
select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
+ select FS_IOMAP
select LZ4_COMPRESS if F2FS_FS_LZ4
select LZ4_DECOMPRESS if F2FS_FS_LZ4
select LZ4HC_COMPRESS if F2FS_FS_LZ4HC
@@ -142,3 +143,10 @@ config F2FS_IOSTAT
Support getting IO statistics through sysfs and printing out periodic
IO statistics tracepoint events. You have to turn on "iostat_enable"
sysfs node to enable this feature.
+
+config F2FS_UNFAIR_RWSEM
+ bool "F2FS unfair rw_semaphore"
+ depends on F2FS_FS && BLK_CGROUP
+ help
+ Use an unfair rw_semaphore if the system configures IO priority via
+ the block cgroup.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 16e826e01f09..eaa240b21f07 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -204,8 +204,9 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu)
return __f2fs_get_acl(inode, type, NULL);
}
-static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
- struct posix_acl **acl)
+static int f2fs_acl_update_mode(struct user_namespace *mnt_userns,
+ struct inode *inode, umode_t *mode_p,
+ struct posix_acl **acl)
{
umode_t mode = inode->i_mode;
int error;
@@ -218,14 +219,15 @@ static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
return error;
if (error == 0)
*acl = NULL;
- if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
- !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
return 0;
}
-static int __f2fs_set_acl(struct inode *inode, int type,
+static int __f2fs_set_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
int name_index;
@@ -238,7 +240,8 @@ static int __f2fs_set_acl(struct inode *inode, int type,
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl && !ipage) {
- error = f2fs_acl_update_mode(inode, &mode, &acl);
+ error = f2fs_acl_update_mode(mnt_userns, inode,
+ &mode, &acl);
if (error)
return error;
set_acl_inode(inode, mode);
@@ -279,7 +282,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
- return __f2fs_set_acl(inode, type, acl, NULL);
+ return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL);
}
/*
@@ -419,7 +422,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
f2fs_mark_inode_dirty_sync(inode, true);
if (default_acl) {
- error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl,
ipage);
posix_acl_release(default_acl);
} else {
@@ -427,7 +430,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
}
if (acl) {
if (!error)
- error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl,
ipage);
posix_acl_release(acl);
} else {
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 83e9bc0f91ff..909085a78f9c 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -98,6 +98,13 @@ repeat:
}
if (unlikely(!PageUptodate(page))) {
+ if (page->index == sbi->metapage_eio_ofs) {
+ if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO)
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ } else {
+ sbi->metapage_eio_ofs = page->index;
+ sbi->metapage_eio_cnt = 0;
+ }
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -282,18 +289,22 @@ out:
return blkno - start;
}
-void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
+ unsigned int ra_blocks)
{
struct page *page;
bool readahead = false;
+ if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
+ return;
+
page = find_get_page(META_MAPPING(sbi), index);
if (!page || !PageUptodate(page))
readahead = true;
f2fs_put_page(page, 0);
if (readahead)
- f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true);
+ f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
}
static int __f2fs_write_meta_page(struct page *page,
@@ -351,13 +362,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* if the trylock fails, cp will flush dirty pages instead */
- if (!down_write_trylock(&sbi->cp_global_sem))
+ if (!f2fs_down_write_trylock(&sbi->cp_global_sem))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -436,26 +447,27 @@ stop:
return nwritten;
}
-static int f2fs_set_meta_page_dirty(struct page *page)
+static bool f2fs_dirty_meta_folio(struct address_space *mapping,
+ struct folio *folio)
{
- trace_f2fs_set_page_dirty(page, META);
-
- if (!PageUptodate(page))
- SetPageUptodate(page);
- if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
- inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
- set_page_private_reference(page);
- return 1;
+ trace_f2fs_set_page_dirty(&folio->page, META);
+
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ if (!folio_test_dirty(folio)) {
+ filemap_dirty_folio(mapping, folio);
+ inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
+ set_page_private_reference(&folio->page);
+ return true;
}
- return 0;
+ return false;
}
const struct address_space_operations f2fs_meta_aops = {
.writepage = f2fs_write_meta_page,
.writepages = f2fs_write_meta_pages,
- .set_page_dirty = f2fs_set_meta_page_dirty,
- .invalidatepage = f2fs_invalidate_page,
+ .dirty_folio = f2fs_dirty_meta_folio,
+ .invalidate_folio = f2fs_invalidate_folio,
.releasepage = f2fs_release_page,
#ifdef CONFIG_MIGRATION
.migratepage = f2fs_migrate_page,
@@ -653,7 +665,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
return PTR_ERR(inode);
}
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err) {
iput(inode);
goto err_out;
@@ -664,7 +676,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
/* truncate all the data during iput */
iput(inode);
- err = f2fs_get_node_info(sbi, ino, &ni);
+ err = f2fs_get_node_info(sbi, ino, &ni, false);
if (err)
goto err_out;
@@ -705,9 +717,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
}
#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- sbi->sb->s_flags |= SB_ACTIVE;
-
/*
* Turn on quotas which were not enabled for read-only mounts if
* filesystem has quota feature, so that they are updated correctly.
@@ -867,6 +876,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
struct f2fs_checkpoint *cp_block = NULL;
unsigned long long cur_version = 0, pre_version = 0;
+ unsigned int cp_blocks;
int err;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
@@ -874,15 +884,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (err)
return NULL;
- if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
- sbi->blocks_per_seg) {
+ cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
+
+ if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count));
goto invalid_cp;
}
pre_version = *version;
- cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+ cp_addr += cp_blocks - 1;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
&cp_page_2, version);
if (err)
@@ -1017,7 +1028,7 @@ static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}
-void f2fs_update_dirty_page(struct inode *inode, struct page *page)
+void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
@@ -1032,7 +1043,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
- set_page_private_reference(page);
+ set_page_private_reference(&folio->page);
}
void f2fs_remove_dirty_inode(struct inode *inode)
@@ -1162,7 +1173,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
if (!is_journalled_quota(sbi))
return false;
- down_write(&sbi->quota_sem);
+ if (!f2fs_down_write_trylock(&sbi->quota_sem))
+ return true;
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
ret = false;
} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
@@ -1173,7 +1185,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
ret = true;
}
- up_write(&sbi->quota_sem);
+ f2fs_up_write(&sbi->quota_sem);
return ret;
}
@@ -1230,10 +1242,10 @@ retry_flush_dents:
* POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush. inode->i_blocks can be updated.
*/
- down_write(&sbi->node_change);
+ f2fs_down_write(&sbi->node_change);
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
err = f2fs_sync_inode_meta(sbi);
if (err)
@@ -1243,15 +1255,15 @@ retry_flush_dents:
}
retry_flush_nodes:
- down_write(&sbi->node_write);
+ f2fs_down_write(&sbi->node_write);
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
- up_write(&sbi->node_write);
+ f2fs_up_write(&sbi->node_write);
atomic_inc(&sbi->wb_sync_req[NODE]);
err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
atomic_dec(&sbi->wb_sync_req[NODE]);
if (err) {
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
return err;
}
@@ -1264,13 +1276,13 @@ retry_flush_nodes:
* dirty node blocks and some checkpoint values by block allocation.
*/
__prepare_cp_block(sbi);
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
return err;
}
static void unblock_operations(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->node_write);
+ f2fs_up_write(&sbi->node_write);
f2fs_unlock_all(sbi);
}
@@ -1304,8 +1316,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned long flags;
if (cpc->reason & CP_UMOUNT) {
- if (le32_to_cpu(ckpt->cp_pack_total_block_count) >
- sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) {
+ if (le32_to_cpu(ckpt->cp_pack_total_block_count) +
+ NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) {
clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
f2fs_notice(sbi, "Disable nat_bits due to no space");
} else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
@@ -1545,6 +1557,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
percpu_counter_set(&sbi->alloc_valid_block_count, 0);
+ percpu_counter_set(&sbi->rf_node_block_count, 0);
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
@@ -1614,7 +1627,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_warn(sbi, "Start checkpoint disabled!");
}
if (cpc->reason != CP_RESIZE)
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->cp_global_sem);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
@@ -1695,7 +1708,7 @@ stop:
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
if (cpc->reason != CP_RESIZE)
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
return err;
}
@@ -1743,9 +1756,9 @@ static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
struct cp_control cpc = { .reason = CP_SYNC, };
int err;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
return err;
}
@@ -1833,9 +1846,9 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
int ret;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
ret = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
return ret;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index c1bf9ad4c220..12a56f9e1572 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
+#include <linux/moduleparam.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/lzo.h>
@@ -153,6 +154,7 @@ void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse)
cc->rpages = NULL;
cc->nr_rpages = 0;
cc->nr_cpages = 0;
+ cc->valid_nr_cpages = 0;
if (!reuse)
cc->cluster_idx = NULL_CLUSTER;
}
@@ -312,10 +314,9 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic)
}
if (ret != PAGE_SIZE << dic->log_cluster_size) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, "
+ printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, "
"expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id,
- dic->rlen,
+ F2FS_I_SB(dic->inode)->sb->s_id, ret,
PAGE_SIZE << dic->log_cluster_size);
return -EIO;
}
@@ -335,8 +336,8 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = {
static int zstd_init_compress_ctx(struct compress_ctx *cc)
{
- ZSTD_parameters params;
- ZSTD_CStream *stream;
+ zstd_parameters params;
+ zstd_cstream *stream;
void *workspace;
unsigned int workspace_size;
unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
@@ -345,17 +346,17 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
if (!level)
level = F2FS_ZSTD_DEFAULT_CLEVEL;
- params = ZSTD_getParams(level, cc->rlen, 0);
- workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams);
+ params = zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen);
+ workspace_size = zstd_cstream_workspace_bound(&params.cParams);
workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
workspace_size, GFP_NOFS);
if (!workspace)
return -ENOMEM;
- stream = ZSTD_initCStream(params, 0, workspace, workspace_size);
+ stream = zstd_init_cstream(&params, 0, workspace, workspace_size);
if (!stream) {
- printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initCStream failed\n",
+ printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_cstream failed\n",
KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
__func__);
kvfree(workspace);
@@ -378,9 +379,9 @@ static void zstd_destroy_compress_ctx(struct compress_ctx *cc)
static int zstd_compress_pages(struct compress_ctx *cc)
{
- ZSTD_CStream *stream = cc->private2;
- ZSTD_inBuffer inbuf;
- ZSTD_outBuffer outbuf;
+ zstd_cstream *stream = cc->private2;
+ zstd_in_buffer inbuf;
+ zstd_out_buffer outbuf;
int src_size = cc->rlen;
int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE;
int ret;
@@ -393,19 +394,19 @@ static int zstd_compress_pages(struct compress_ctx *cc)
outbuf.dst = cc->cbuf->cdata;
outbuf.size = dst_size;
- ret = ZSTD_compressStream(stream, &outbuf, &inbuf);
- if (ZSTD_isError(ret)) {
- printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n",
+ ret = zstd_compress_stream(stream, &outbuf, &inbuf);
+ if (zstd_is_error(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s zstd_compress_stream failed, ret: %d\n",
KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
- __func__, ZSTD_getErrorCode(ret));
+ __func__, zstd_get_error_code(ret));
return -EIO;
}
- ret = ZSTD_endStream(stream, &outbuf);
- if (ZSTD_isError(ret)) {
- printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_endStream returned %d\n",
+ ret = zstd_end_stream(stream, &outbuf);
+ if (zstd_is_error(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s zstd_end_stream returned %d\n",
KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
- __func__, ZSTD_getErrorCode(ret));
+ __func__, zstd_get_error_code(ret));
return -EIO;
}
@@ -422,22 +423,22 @@ static int zstd_compress_pages(struct compress_ctx *cc)
static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
{
- ZSTD_DStream *stream;
+ zstd_dstream *stream;
void *workspace;
unsigned int workspace_size;
unsigned int max_window_size =
MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size);
- workspace_size = ZSTD_DStreamWorkspaceBound(max_window_size);
+ workspace_size = zstd_dstream_workspace_bound(max_window_size);
workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode),
workspace_size, GFP_NOFS);
if (!workspace)
return -ENOMEM;
- stream = ZSTD_initDStream(max_window_size, workspace, workspace_size);
+ stream = zstd_init_dstream(max_window_size, workspace, workspace_size);
if (!stream) {
- printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n",
+ printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_dstream failed\n",
KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
__func__);
kvfree(workspace);
@@ -459,9 +460,9 @@ static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic)
static int zstd_decompress_pages(struct decompress_io_ctx *dic)
{
- ZSTD_DStream *stream = dic->private2;
- ZSTD_inBuffer inbuf;
- ZSTD_outBuffer outbuf;
+ zstd_dstream *stream = dic->private2;
+ zstd_in_buffer inbuf;
+ zstd_out_buffer outbuf;
int ret;
inbuf.pos = 0;
@@ -472,11 +473,11 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic)
outbuf.dst = dic->rbuf;
outbuf.size = dic->rlen;
- ret = ZSTD_decompressStream(stream, &outbuf, &inbuf);
- if (ZSTD_isError(ret)) {
- printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n",
+ ret = zstd_decompress_stream(stream, &outbuf, &inbuf);
+ if (zstd_is_error(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s zstd_decompress_stream failed, ret: %d\n",
KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
- __func__, ZSTD_getErrorCode(ret));
+ __func__, zstd_get_error_code(ret));
return -EIO;
}
@@ -619,7 +620,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
unsigned int max_len, new_nr_cpages;
- struct page **new_cpages;
u32 chksum = 0;
int i, ret;
@@ -634,6 +634,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
max_len = COMPRESS_HEADER_SIZE + cc->clen;
cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE);
+ cc->valid_nr_cpages = cc->nr_cpages;
cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages);
if (!cc->cpages) {
@@ -684,13 +685,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
- /* Now we're going to cut unnecessary tail pages */
- new_cpages = page_array_alloc(cc->inode, new_nr_cpages);
- if (!new_cpages) {
- ret = -ENOMEM;
- goto out_vunmap_cbuf;
- }
-
/* zero out any unused part of the last page */
memset(&cc->cbuf->cdata[cc->clen], 0,
(new_nr_cpages * PAGE_SIZE) -
@@ -700,10 +694,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
vm_unmap_ram(cc->rbuf, cc->cluster_size);
for (i = 0; i < cc->nr_cpages; i++) {
- if (i < new_nr_cpages) {
- new_cpages[i] = cc->cpages[i];
+ if (i < new_nr_cpages)
continue;
- }
f2fs_compress_free_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
@@ -711,9 +703,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
if (cops->destroy_compress_ctx)
cops->destroy_compress_ctx(cc);
- page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
- cc->cpages = new_cpages;
- cc->nr_cpages = new_nr_cpages;
+ cc->valid_nr_cpages = new_nr_cpages;
trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
cc->clen, ret);
@@ -881,6 +871,25 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
return is_page_in_cluster(cc, index);
}
+bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
+ int index, int nr_pages)
+{
+ unsigned long pgidx;
+ int i;
+
+ if (nr_pages - index < cc->cluster_size)
+ return false;
+
+ pgidx = pvec->pages[index]->index;
+
+ for (i = 1; i < cc->cluster_size; i++) {
+ if (pvec->pages[index + i]->index != pgidx + i)
+ return false;
+ }
+
+ return true;
+}
+
static bool cluster_has_invalid_data(struct compress_ctx *cc)
{
loff_t i_size = i_size_read(cc->inode);
@@ -1257,7 +1266,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
* checkpoint. This can only happen to quota writes which can cause
* the below discard race condition.
*/
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
} else if (!f2fs_trylock_op(sbi)) {
goto out_free;
}
@@ -1276,7 +1285,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT;
- err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
+ err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false);
if (err)
goto out_put_dnode;
@@ -1288,14 +1297,14 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
cic->inode = inode;
- atomic_set(&cic->pending_pages, cc->nr_cpages);
+ atomic_set(&cic->pending_pages, cc->valid_nr_cpages);
cic->rpages = page_array_alloc(cc->inode, cc->cluster_size);
if (!cic->rpages)
goto out_put_cic;
cic->nr_rpages = cc->cluster_size;
- for (i = 0; i < cc->nr_cpages; i++) {
+ for (i = 0; i < cc->valid_nr_cpages; i++) {
f2fs_set_compressed_page(cc->cpages[i], inode,
cc->rpages[i + 1]->index, cic);
fio.compressed_page = cc->cpages[i];
@@ -1340,7 +1349,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr))
fio.compr_blocks++;
- if (i > cc->nr_cpages) {
+ if (i > cc->valid_nr_cpages) {
if (__is_valid_data_blkaddr(blkaddr)) {
f2fs_invalidate_blocks(sbi, blkaddr);
f2fs_update_data_blkaddr(&dn, NEW_ADDR);
@@ -1365,8 +1374,8 @@ unlock_continue:
if (fio.compr_blocks)
f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false);
- f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true);
- add_compr_block_stat(inode, cc->nr_cpages);
+ f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true);
+ add_compr_block_stat(inode, cc->valid_nr_cpages);
set_inode_flag(cc->inode, FI_APPEND_WRITE);
if (cc->cluster_idx == 0)
@@ -1374,7 +1383,7 @@ unlock_continue:
f2fs_put_dnode(&dn);
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
else
f2fs_unlock_op(sbi);
@@ -1400,13 +1409,11 @@ out_put_dnode:
f2fs_put_dnode(&dn);
out_unlock_op:
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
else
f2fs_unlock_op(sbi);
out_free:
- for (i = 0; i < cc->nr_cpages; i++) {
- if (!cc->cpages[i])
- continue;
+ for (i = 0; i < cc->valid_nr_cpages; i++) {
f2fs_compress_free_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
@@ -1448,25 +1455,38 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
enum iostat_type io_type)
{
struct address_space *mapping = cc->inode->i_mapping;
- int _submitted, compr_blocks, ret;
- int i = -1, err = 0;
+ int _submitted, compr_blocks, ret, i;
compr_blocks = f2fs_compressed_blocks(cc);
- if (compr_blocks < 0) {
- err = compr_blocks;
- goto out_err;
+
+ for (i = 0; i < cc->cluster_size; i++) {
+ if (!cc->rpages[i])
+ continue;
+
+ redirty_page_for_writepage(wbc, cc->rpages[i]);
+ unlock_page(cc->rpages[i]);
}
+ if (compr_blocks < 0)
+ return compr_blocks;
+
for (i = 0; i < cc->cluster_size; i++) {
if (!cc->rpages[i])
continue;
retry_write:
+ lock_page(cc->rpages[i]);
+
if (cc->rpages[i]->mapping != mapping) {
+continue_unlock:
unlock_page(cc->rpages[i]);
continue;
}
- BUG_ON(!PageLocked(cc->rpages[i]));
+ if (!PageDirty(cc->rpages[i]))
+ goto continue_unlock;
+
+ if (!clear_page_dirty_for_io(cc->rpages[i]))
+ goto continue_unlock;
ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
NULL, NULL, wbc, io_type,
@@ -1481,26 +1501,13 @@ retry_write:
* avoid deadlock caused by cluster update race
* from foreground operation.
*/
- if (IS_NOQUOTA(cc->inode)) {
- err = 0;
- goto out_err;
- }
+ if (IS_NOQUOTA(cc->inode))
+ return 0;
ret = 0;
- cond_resched();
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
- lock_page(cc->rpages[i]);
-
- if (!PageDirty(cc->rpages[i])) {
- unlock_page(cc->rpages[i]);
- continue;
- }
-
- clear_page_dirty_for_io(cc->rpages[i]);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry_write;
}
- err = ret;
- goto out_err;
+ return ret;
}
*submitted += _submitted;
@@ -1509,14 +1516,6 @@ retry_write:
f2fs_balance_fs(F2FS_M_SB(mapping), true);
return 0;
-out_err:
- for (++i; i < cc->cluster_size; i++) {
- if (!cc->rpages[i])
- continue;
- redirty_page_for_writepage(wbc, cc->rpages[i]);
- unlock_page(cc->rpages[i]);
- }
- return err;
}
int f2fs_write_multi_pages(struct compress_ctx *cc,
@@ -1530,6 +1529,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
if (cluster_may_compress(cc)) {
err = f2fs_compress_pages(cc);
if (err == -EAGAIN) {
+ add_compr_block_stat(cc->inode, cc->cluster_size);
goto write;
} else if (err) {
f2fs_put_rpages_wbc(cc, wbc, true, 1);
@@ -1747,7 +1747,7 @@ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
const struct address_space_operations f2fs_compress_aops = {
.releasepage = f2fs_release_page,
- .invalidatepage = f2fs_invalidate_page,
+ .invalidate_folio = f2fs_invalidate_folio,
};
struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f4fd6c246c9a..9a1a526f2092 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -8,9 +8,9 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
+#include <linux/sched/mm.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
-#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
@@ -18,9 +18,9 @@
#include <linux/swap.h>
#include <linux/prefetch.h>
#include <linux/uio.h>
-#include <linux/cleancache.h>
#include <linux/sched/signal.h>
#include <linux/fiemap.h>
+#include <linux/iomap.h>
#include "f2fs.h"
#include "node.h"
@@ -164,7 +164,7 @@ static void f2fs_verify_bio(struct work_struct *work)
bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS);
/*
- * fsverity_verify_bio() may call readpages() again, and while verity
+ * fsverity_verify_bio() may call readahead() again, and while verity
* will be disabled for this, decryption and/or decompression may still
* be needed, resulting in another bio_post_read_ctx being allocated.
* So to prevent deadlocks we need to release the current ctx to the
@@ -354,7 +354,7 @@ static void f2fs_write_end_io(struct bio *bio)
}
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
- block_t blk_addr, struct bio *bio)
+ block_t blk_addr, sector_t *sector)
{
struct block_device *bdev = sbi->sb->s_bdev;
int i;
@@ -369,10 +369,9 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
}
}
}
- if (bio) {
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
- }
+
+ if (sector)
+ *sector = SECTOR_FROM_BLOCK(blk_addr);
return bdev;
}
@@ -389,22 +388,55 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
+static unsigned int f2fs_io_flags(struct f2fs_io_info *fio)
+{
+ unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
+ unsigned int fua_flag, meta_flag, io_flag;
+ unsigned int op_flags = 0;
+
+ if (fio->op != REQ_OP_WRITE)
+ return 0;
+ if (fio->type == DATA)
+ io_flag = fio->sbi->data_io_flag;
+ else if (fio->type == NODE)
+ io_flag = fio->sbi->node_io_flag;
+ else
+ return 0;
+
+ fua_flag = io_flag & temp_mask;
+ meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
+
+ /*
+ * data/node io flag bits per temp:
+ *      REQ_META     |      REQ_FUA      |
+ *    5 |    4 |   3 |    2 |    1 |   0 |
+ * Cold | Warm | Hot | Cold | Warm | Hot |
+ */
+ if ((1 << fio->temp) & meta_flag)
+ op_flags |= REQ_META;
+ if ((1 << fio->temp) & fua_flag)
+ op_flags |= REQ_FUA;
+ return op_flags;
+}
+
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
{
struct f2fs_sb_info *sbi = fio->sbi;
+ struct block_device *bdev;
+ sector_t sector;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset);
-
- f2fs_target_device(sbi, fio->new_blkaddr, bio);
+ bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector);
+ bio = bio_alloc_bioset(bdev, npages,
+ fio->op | fio->op_flags | f2fs_io_flags(fio),
+ GFP_NOIO, &f2fs_bioset);
+ bio->bi_iter.bi_sector = sector;
if (is_read_io(fio->op)) {
bio->bi_end_io = f2fs_read_end_io;
bio->bi_private = NULL;
} else {
bio->bi_end_io = f2fs_write_end_io;
bio->bi_private = sbi;
- bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
- fio->type, fio->temp);
}
iostat_alloc_and_bind_ctx(sbi, bio, NULL);
@@ -500,34 +532,6 @@ void f2fs_submit_bio(struct f2fs_sb_info *sbi,
__submit_bio(sbi, bio, type);
}
-static void __attach_io_flag(struct f2fs_io_info *fio)
-{
- struct f2fs_sb_info *sbi = fio->sbi;
- unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
- unsigned int io_flag, fua_flag, meta_flag;
-
- if (fio->type == DATA)
- io_flag = sbi->data_io_flag;
- else if (fio->type == NODE)
- io_flag = sbi->node_io_flag;
- else
- return;
-
- fua_flag = io_flag & temp_mask;
- meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
-
- /*
- * data/node io flag bits per temp:
- * REQ_META | REQ_FUA |
- * 5 | 4 | 3 | 2 | 1 | 0 |
- * Cold | Warm | Hot | Cold | Warm | Hot |
- */
- if ((1 << fio->temp) & meta_flag)
- fio->op_flags |= REQ_META;
- if ((1 << fio->temp) & fua_flag)
- fio->op_flags |= REQ_FUA;
-}
-
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
@@ -535,9 +539,6 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
if (!io->bio)
return;
- __attach_io_flag(fio);
- bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
-
if (is_read_io(fio->op))
trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio);
else
@@ -590,18 +591,17 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
- down_write(&io->io_rwsem);
+ f2fs_down_write(&io->io_rwsem);
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
- io->fio.op = REQ_OP_WRITE;
- io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC;
+ io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC;
if (!test_opt(sbi, NOBARRIER))
- io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
+ io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA;
}
__submit_merged_bio(io);
- up_write(&io->io_rwsem);
+ f2fs_up_write(&io->io_rwsem);
}
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
@@ -616,9 +616,9 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
- down_read(&io->io_rwsem);
+ f2fs_down_read(&io->io_rwsem);
ret = __has_merged_page(io->bio, inode, page, ino);
- up_read(&io->io_rwsem);
+ f2fs_up_read(&io->io_rwsem);
}
if (ret)
__f2fs_submit_merged_write(sbi, type, temp);
@@ -679,9 +679,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (fio->io_wbc && !is_read_io(fio->op))
wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
- __attach_io_flag(fio);
- bio_set_op_attrs(bio, fio->op, fio->op_flags);
-
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page): WB_DATA_TYPE(fio->page));
@@ -742,9 +739,9 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE)
f2fs_bug_on(sbi, 1);
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_add_tail(&be->list, &io->bio_list);
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
static void del_bio_entry(struct bio_entry *be)
@@ -766,7 +763,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
struct list_head *head = &io->bio_list;
struct bio_entry *be;
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (be->bio != *bio)
continue;
@@ -790,7 +787,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
__submit_bio(sbi, *bio, DATA);
break;
}
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
if (ret) {
@@ -816,7 +813,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
if (list_empty(head))
continue;
- down_read(&io->bio_list_lock);
+ f2fs_down_read(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (target)
found = (target == be->bio);
@@ -826,14 +823,14 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
if (found)
break;
}
- up_read(&io->bio_list_lock);
+ f2fs_up_read(&io->bio_list_lock);
if (!found)
continue;
found = false;
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (target)
found = (target == be->bio);
@@ -846,7 +843,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
break;
}
}
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
if (found)
@@ -875,10 +872,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_VECS);
- __attach_io_flag(fio);
f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
fio->page->index, fio, GFP_NOIO);
- bio_set_op_attrs(bio, fio->op, fio->op_flags);
add_bio_entry(fio->sbi, bio, page, fio->temp);
} else {
@@ -906,7 +901,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
f2fs_bug_on(sbi, is_read_io(fio->op));
- down_write(&io->io_rwsem);
+ f2fs_down_write(&io->io_rwsem);
next:
if (fio->in_list) {
spin_lock(&io->io_lock);
@@ -973,7 +968,7 @@ out:
if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
!f2fs_is_checkpoint_ready(sbi))
__submit_merged_bio(io);
- up_write(&io->io_rwsem);
+ f2fs_up_write(&io->io_rwsem);
}
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
@@ -984,17 +979,17 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
struct bio *bio;
struct bio_post_read_ctx *ctx = NULL;
unsigned int post_read_steps = 0;
+ sector_t sector;
+ struct block_device *bdev = f2fs_target_device(sbi, blkaddr, &sector);
- bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
- bio_max_segs(nr_pages), &f2fs_bioset);
+ bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages),
+ REQ_OP_READ | op_flag,
+ for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset);
if (!bio)
return ERR_PTR(-ENOMEM);
-
+ bio->bi_iter.bi_sector = sector;
f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS);
-
- f2fs_target_device(sbi, blkaddr, bio);
bio->bi_end_io = f2fs_read_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ, op_flag);
if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= STEP_DECRYPT;
@@ -1354,7 +1349,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
- err = f2fs_get_node_info(sbi, dn->nid, &ni);
+ err = f2fs_get_node_info(sbi, dn->nid, &ni, false);
if (err)
return err;
@@ -1376,68 +1371,16 @@ alloc:
f2fs_invalidate_compress_page(sbi, old_blkaddr);
}
f2fs_update_data_blkaddr(dn, dn->data_blkaddr);
-
- /*
- * i_size will be updated by direct_IO. Otherwise, we'll get stale
- * data from unwritten block via dio_read.
- */
return 0;
}
-int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct f2fs_map_blocks map;
- int flag;
- int err = 0;
- bool direct_io = iocb->ki_flags & IOCB_DIRECT;
-
- map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
- map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
- if (map.m_len > map.m_lblk)
- map.m_len -= map.m_lblk;
- else
- map.m_len = 0;
-
- map.m_next_pgofs = NULL;
- map.m_next_extent = NULL;
- map.m_seg_type = NO_CHECK_TYPE;
- map.m_may_create = true;
-
- if (direct_io) {
- map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint);
- flag = f2fs_force_buffered_io(inode, iocb, from) ?
- F2FS_GET_BLOCK_PRE_AIO :
- F2FS_GET_BLOCK_PRE_DIO;
- goto map_blocks;
- }
- if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) {
- err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
- if (f2fs_has_inline_data(inode))
- return err;
-
- flag = F2FS_GET_BLOCK_PRE_AIO;
-
-map_blocks:
- err = f2fs_map_blocks(inode, &map, 1, flag);
- if (map.m_len > 0 && err == -ENOSPC) {
- if (!direct_io)
- set_inode_flag(inode, FI_NO_PREALLOC);
- err = 0;
- }
- return err;
-}
-
void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
{
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (lock)
- down_read(&sbi->node_change);
+ f2fs_down_read(&sbi->node_change);
else
- up_read(&sbi->node_change);
+ f2fs_up_read(&sbi->node_change);
} else {
if (lock)
f2fs_lock_op(sbi);
@@ -1465,10 +1408,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
struct extent_info ei = {0, };
block_t blkaddr;
unsigned int start_pgofs;
+ int bidx = 0;
if (!maxblocks)
return 0;
+ map->m_bdev = inode->i_sb->s_bdev;
+ map->m_multidev_dio =
+ f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag);
+
map->m_len = 0;
map->m_flags = 0;
@@ -1491,6 +1439,21 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
if (flag == F2FS_GET_BLOCK_DIO)
f2fs_wait_on_block_writeback_range(inode,
map->m_pblk, map->m_len);
+
+ if (map->m_multidev_dio) {
+ block_t blk_addr = map->m_pblk;
+
+ bidx = f2fs_target_device_index(sbi, map->m_pblk);
+
+ map->m_bdev = FDEV(bidx).bdev;
+ map->m_pblk -= FDEV(bidx).start_blk;
+ map->m_len = min(map->m_len,
+ FDEV(bidx).end_blk + 1 - map->m_pblk);
+
+ if (map->m_may_create)
+ f2fs_update_device_state(sbi, inode->i_ino,
+ blk_addr, map->m_len);
+ }
goto out;
}
@@ -1570,8 +1533,11 @@ next_block:
flag != F2FS_GET_BLOCK_DIO);
err = __allocate_data_block(&dn,
map->m_seg_type);
- if (!err)
+ if (!err) {
+ if (flag == F2FS_GET_BLOCK_PRE_DIO)
+ file_need_truncate(inode);
set_inode_flag(inode, FI_APPEND_WRITE);
+ }
}
if (err)
goto sync_out;
@@ -1609,6 +1575,9 @@ next_block:
if (flag == F2FS_GET_BLOCK_PRE_AIO)
goto skip;
+ if (map->m_multidev_dio)
+ bidx = f2fs_target_device_index(sbi, blkaddr);
+
if (map->m_len == 0) {
/* preallocated unwritten block should be mapped for fiemap. */
if (blkaddr == NEW_ADDR)
@@ -1617,10 +1586,15 @@ next_block:
map->m_pblk = blkaddr;
map->m_len = 1;
+
+ if (map->m_multidev_dio)
+ map->m_bdev = FDEV(bidx).bdev;
} else if ((map->m_pblk != NEW_ADDR &&
blkaddr == (map->m_pblk + ofs)) ||
(map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
flag == F2FS_GET_BLOCK_PRE_DIO) {
+ if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev)
+ goto sync_out;
ofs++;
map->m_len++;
} else {
@@ -1673,10 +1647,32 @@ skip:
sync_out:
- /* for hardware encryption, but to avoid potential issue in future */
- if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED)
+ if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) {
+ /*
+ * for hardware encryption, but to avoid potential issue
+ * in future
+ */
f2fs_wait_on_block_writeback_range(inode,
map->m_pblk, map->m_len);
+ invalidate_mapping_pages(META_MAPPING(sbi),
+ map->m_pblk, map->m_pblk);
+
+ if (map->m_multidev_dio) {
+ block_t blk_addr = map->m_pblk;
+
+ bidx = f2fs_target_device_index(sbi, map->m_pblk);
+
+ map->m_bdev = FDEV(bidx).bdev;
+ map->m_pblk -= FDEV(bidx).start_blk;
+
+ if (map->m_may_create)
+ f2fs_update_device_state(sbi, inode->i_ino,
+ blk_addr, map->m_len);
+
+ f2fs_bug_on(sbi, blk_addr + map->m_len >
+ FDEV(bidx).end_blk + 1);
+ }
+ }
if (flag == F2FS_GET_BLOCK_PRECACHE) {
if (map->m_flags & F2FS_MAP_MAPPED) {
@@ -1696,7 +1692,7 @@ unlock_out:
f2fs_balance_fs(sbi, dn.node_changed);
}
out:
- trace_f2fs_map_blocks(inode, map, err);
+ trace_f2fs_map_blocks(inode, map, create, flag, err);
return err;
}
@@ -1736,47 +1732,6 @@ static inline u64 blks_to_bytes(struct inode *inode, u64 blks)
return (blks << inode->i_blkbits);
}
-static int __get_data_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create, int flag,
- pgoff_t *next_pgofs, int seg_type, bool may_write)
-{
- struct f2fs_map_blocks map;
- int err;
-
- map.m_lblk = iblock;
- map.m_len = bytes_to_blks(inode, bh->b_size);
- map.m_next_pgofs = next_pgofs;
- map.m_next_extent = NULL;
- map.m_seg_type = seg_type;
- map.m_may_create = may_write;
-
- err = f2fs_map_blocks(inode, &map, create, flag);
- if (!err) {
- map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
- bh->b_size = blks_to_bytes(inode, map.m_len);
- }
- return err;
-}
-
-static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_DIO, NULL,
- f2fs_rw_hint_to_seg_type(inode->i_write_hint),
- true);
-}
-
-static int get_data_block_dio(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_DIO, NULL,
- f2fs_rw_hint_to_seg_type(inode->i_write_hint),
- false);
-}
-
static int f2fs_xattr_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo)
{
@@ -1796,7 +1751,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
if (!page)
return -ENOMEM;
- err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
+ err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false);
if (err) {
f2fs_put_page(page, 1);
return err;
@@ -1828,7 +1783,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
if (!page)
return -ENOMEM;
- err = f2fs_get_node_info(sbi, xnid, &ni);
+ err = f2fs_get_node_info(sbi, xnid, &ni, false);
if (err) {
f2fs_put_page(page, 1);
return err;
@@ -2074,12 +2029,6 @@ got_it:
block_nr = map->m_pblk + block_in_file - map->m_lblk;
SetPageMappedToDisk(page);
- if (!PageUptodate(page) && (!PageSwapCache(page) &&
- !cleancache_get_page(page))) {
- SetPageUptodate(page);
- goto confused;
- }
-
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
DATA_GENERIC_ENHANCE_READ)) {
ret = -EFSCORRUPTED;
@@ -2135,12 +2084,6 @@ submit_and_realloc:
ClearPageError(page);
*last_block_in_bio = block_nr;
goto out;
-confused:
- if (bio) {
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
- bio = NULL;
- }
- unlock_page(page);
out:
*bio_ret = bio;
return ret;
@@ -2458,7 +2401,7 @@ static void f2fs_readahead(struct readahead_control *rac)
if (!f2fs_is_compress_backend_ready(inode))
return;
- /* If the file has inline data, skip readpages */
+ /* If the file has inline data, skip readahead */
if (f2fs_has_inline_data(inode))
return;
@@ -2489,7 +2432,7 @@ retry_encrypt:
/* flush pending IOs and wait for a while in the ENOMEM case */
if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
f2fs_flush_merged_writes(fio->sbi);
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ memalloc_retry_wait(GFP_NOFS);
gfp_flags |= __GFP_NOFAIL;
goto retry_encrypt;
}
@@ -2512,6 +2455,9 @@ static inline bool check_inplace_update_policy(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int policy = SM_I(sbi)->ipu_policy;
+ if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) &&
+ is_inode_flag_set(inode, FI_OPU_WRITE))
+ return false;
if (policy & (0x1 << F2FS_IPU_FORCE))
return true;
if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi))
@@ -2564,6 +2510,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ /* The below cases were checked when setting it. */
+ if (f2fs_is_pinned_file(inode))
+ return false;
+ if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK))
+ return true;
if (f2fs_lfs_mode(sbi))
return true;
if (S_ISDIR(inode->i_mode))
@@ -2572,13 +2523,14 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (f2fs_is_atomic_file(inode))
return true;
- if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
- return true;
/* swap file is migrating in aligned write mode */
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
return true;
+ if (is_inode_flag_set(inode, FI_OPU_WRITE))
+ return true;
+
if (fio) {
if (page_private_gcing(fio->page))
return true;
@@ -2685,7 +2637,7 @@ got_it:
fio->need_lock = LOCK_REQ;
}
- err = f2fs_get_node_info(fio->sbi, dn.nid, &ni);
+ err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false);
if (err)
goto out_writepage;
@@ -2798,13 +2750,13 @@ write:
* the below discard race condition.
*/
if (IS_NOQUOTA(inode))
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
fio.need_lock = LOCK_DONE;
err = f2fs_do_write_data_page(&fio);
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
goto done;
}
@@ -2934,6 +2886,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
.rpages = NULL,
.nr_rpages = 0,
.cpages = NULL,
+ .valid_nr_cpages = 0,
.rbuf = NULL,
.cbuf = NULL,
.rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size,
@@ -2989,6 +2942,10 @@ readd:
need_readd = false;
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
+ void *fsdata = NULL;
+ struct page *pagep;
+ int ret2;
+
ret = f2fs_init_compress_ctx(&cc);
if (ret) {
done = 1;
@@ -3007,27 +2964,23 @@ readd:
if (unlikely(f2fs_cp_error(sbi)))
goto lock_page;
- if (f2fs_cluster_is_empty(&cc)) {
- void *fsdata = NULL;
- struct page *pagep;
- int ret2;
+ if (!f2fs_cluster_is_empty(&cc))
+ goto lock_page;
- ret2 = f2fs_prepare_compress_overwrite(
+ ret2 = f2fs_prepare_compress_overwrite(
inode, &pagep,
page->index, &fsdata);
- if (ret2 < 0) {
- ret = ret2;
- done = 1;
- break;
- } else if (ret2 &&
- !f2fs_compress_write_end(inode,
- fsdata, page->index,
- 1)) {
- retry = 1;
- break;
- }
- } else {
- goto lock_page;
+ if (ret2 < 0) {
+ ret = ret2;
+ done = 1;
+ break;
+ } else if (ret2 &&
+ (!f2fs_compress_write_end(inode,
+ fsdata, page->index, 1) ||
+ !f2fs_all_cluster_page_loaded(&cc,
+ &pvec, i, nr_pages))) {
+ retry = 1;
+ break;
}
}
#endif
@@ -3095,8 +3048,7 @@ result:
} else if (ret == -EAGAIN) {
ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC,
+ f2fs_io_schedule_timeout(
DEFAULT_IO_TIMEOUT);
goto retry_write;
}
@@ -3202,8 +3154,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
f2fs_available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
- /* skip writing during file defragment */
- if (is_inode_flag_set(inode, FI_DO_DEFRAG))
+ /* skip writing in file defragment preparing stage */
+ if (is_inode_flag_set(inode, FI_SKIP_WRITES))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, DATA);
@@ -3211,8 +3163,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
/* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[DATA]);
- else if (atomic_read(&sbi->wb_sync_req[DATA]))
+ else if (atomic_read(&sbi->wb_sync_req[DATA])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
if (__should_serialize_io(inode, wbc)) {
mutex_lock(&sbi->writepages);
@@ -3252,7 +3208,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
FS_CP_DATA_IO : FS_DATA_IO);
}
-static void f2fs_write_failed(struct inode *inode, loff_t to)
+void f2fs_write_failed(struct inode *inode, loff_t to)
{
loff_t i_size = i_size_read(inode);
@@ -3261,14 +3217,14 @@ static void f2fs_write_failed(struct inode *inode, loff_t to)
/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
if (to > i_size && !f2fs_verity_in_progress(inode)) {
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache(inode, i_size);
f2fs_truncate_blocks(inode, i_size, true);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -3286,12 +3242,10 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
int flag;
/*
- * we already allocated all the blocks, so we don't need to get
- * the block addresses when there is no need to fill the page.
+ * If a whole page is being written and we already preallocated all the
+ * blocks, then there is no need to get a block address now.
*/
- if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE &&
- !is_inode_flag_set(inode, FI_NO_PREALLOC) &&
- !f2fs_verity_in_progress(inode))
+ if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL))
return 0;
/* f2fs_lock_op avoids race between write CP and convert_inline_page */
@@ -3403,7 +3357,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
*fsdata = NULL;
- if (len == PAGE_SIZE)
+ if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode)))
goto repeat;
ret = f2fs_prepare_compress_overwrite(inode, pagep,
@@ -3542,169 +3496,16 @@ unlock_out:
return copied;
}
-static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
- loff_t offset)
-{
- unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
- unsigned blkbits = i_blkbits;
- unsigned blocksize_mask = (1 << blkbits) - 1;
- unsigned long align = offset | iov_iter_alignment(iter);
- struct block_device *bdev = inode->i_sb->s_bdev;
-
- if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode))
- return 1;
-
- if (align & blocksize_mask) {
- if (bdev)
- blkbits = blksize_bits(bdev_logical_block_size(bdev));
- blocksize_mask = (1 << blkbits) - 1;
- if (align & blocksize_mask)
- return -EINVAL;
- return 1;
- }
- return 0;
-}
-
-static void f2fs_dio_end_io(struct bio *bio)
-{
- struct f2fs_private_dio *dio = bio->bi_private;
-
- dec_page_count(F2FS_I_SB(dio->inode),
- dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
-
- bio->bi_private = dio->orig_private;
- bio->bi_end_io = dio->orig_end_io;
-
- kfree(dio);
-
- bio_endio(bio);
-}
-
-static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode,
- loff_t file_offset)
-{
- struct f2fs_private_dio *dio;
- bool write = (bio_op(bio) == REQ_OP_WRITE);
-
- dio = f2fs_kzalloc(F2FS_I_SB(inode),
- sizeof(struct f2fs_private_dio), GFP_NOFS);
- if (!dio)
- goto out;
-
- dio->inode = inode;
- dio->orig_end_io = bio->bi_end_io;
- dio->orig_private = bio->bi_private;
- dio->write = write;
-
- bio->bi_end_io = f2fs_dio_end_io;
- bio->bi_private = dio;
-
- inc_page_count(F2FS_I_SB(inode),
- write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
-
- submit_bio(bio);
- return;
-out:
- bio->bi_status = BLK_STS_IOERR;
- bio_endio(bio);
-}
-
-static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_inode_info *fi = F2FS_I(inode);
- size_t count = iov_iter_count(iter);
- loff_t offset = iocb->ki_pos;
- int rw = iov_iter_rw(iter);
- int err;
- enum rw_hint hint = iocb->ki_hint;
- int whint_mode = F2FS_OPTION(sbi).whint_mode;
- bool do_opu;
-
- err = check_direct_IO(inode, iter, offset);
- if (err)
- return err < 0 ? err : 0;
-
- if (f2fs_force_buffered_io(inode, iocb, iter))
- return 0;
-
- do_opu = rw == WRITE && f2fs_lfs_mode(sbi);
-
- trace_f2fs_direct_IO_enter(inode, offset, count, rw);
-
- if (rw == WRITE && whint_mode == WHINT_MODE_OFF)
- iocb->ki_hint = WRITE_LIFE_NOT_SET;
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!down_read_trylock(&fi->i_gc_rwsem[rw])) {
- iocb->ki_hint = hint;
- err = -EAGAIN;
- goto out;
- }
- if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
- up_read(&fi->i_gc_rwsem[rw]);
- iocb->ki_hint = hint;
- err = -EAGAIN;
- goto out;
- }
- } else {
- down_read(&fi->i_gc_rwsem[rw]);
- if (do_opu)
- down_read(&fi->i_gc_rwsem[READ]);
- }
-
- err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, rw == WRITE ? get_data_block_dio_write :
- get_data_block_dio, NULL, f2fs_dio_submit_bio,
- rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES :
- DIO_SKIP_HOLES);
-
- if (do_opu)
- up_read(&fi->i_gc_rwsem[READ]);
-
- up_read(&fi->i_gc_rwsem[rw]);
-
- if (rw == WRITE) {
- if (whint_mode == WHINT_MODE_OFF)
- iocb->ki_hint = hint;
- if (err > 0) {
- f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
- err);
- if (!do_opu)
- set_inode_flag(inode, FI_UPDATE_WRITE);
- } else if (err == -EIOCBQUEUED) {
- f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
- count - iov_iter_count(iter));
- } else if (err < 0) {
- f2fs_write_failed(inode, offset + count);
- }
- } else {
- if (err > 0)
- f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err);
- else if (err == -EIOCBQUEUED)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_READ_IO,
- count - iov_iter_count(iter));
- }
-
-out:
- trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
-
- return err;
-}
-
-void f2fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
- (offset % PAGE_SIZE || length != PAGE_SIZE))
+ (offset || length != folio_size(folio)))
return;
- if (PageDirty(page)) {
+ if (folio_test_dirty(folio)) {
if (inode->i_ino == F2FS_META_INO(sbi)) {
dec_page_count(sbi, F2FS_DIRTY_META);
} else if (inode->i_ino == F2FS_NODE_INO(sbi)) {
@@ -3715,20 +3516,16 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
}
}
- clear_page_private_gcing(page);
+ clear_page_private_gcing(&folio->page);
- if (test_opt(sbi, COMPRESS_CACHE)) {
- if (f2fs_compressed_file(inode))
- f2fs_invalidate_compress_pages(sbi, inode->i_ino);
- if (inode->i_ino == F2FS_COMPRESS_INO(sbi))
- clear_page_private_data(page);
- }
+ if (test_opt(sbi, COMPRESS_CACHE) &&
+ inode->i_ino == F2FS_COMPRESS_INO(sbi))
+ clear_page_private_data(&folio->page);
- if (page_private_atomic(page))
- return f2fs_drop_inmem_page(inode, page);
+ if (page_private_atomic(&folio->page))
+ return f2fs_drop_inmem_page(inode, &folio->page);
- detach_page_private(page);
- set_page_private(page, 0);
+ folio_detach_private(folio);
}
int f2fs_release_page(struct page *page, gfp_t wait)
@@ -3742,12 +3539,9 @@ int f2fs_release_page(struct page *page, gfp_t wait)
return 0;
if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) {
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
struct inode *inode = page->mapping->host;
- if (f2fs_compressed_file(inode))
- f2fs_invalidate_compress_pages(sbi, inode->i_ino);
- if (inode->i_ino == F2FS_COMPRESS_INO(sbi))
+ if (inode->i_ino == F2FS_COMPRESS_INO(F2FS_I_SB(inode)))
clear_page_private_data(page);
}
@@ -3758,35 +3552,35 @@ int f2fs_release_page(struct page *page, gfp_t wait)
return 1;
}
-static int f2fs_set_data_page_dirty(struct page *page)
+static bool f2fs_dirty_data_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = mapping->host;
- trace_f2fs_set_page_dirty(page, DATA);
+ trace_f2fs_set_page_dirty(&folio->page, DATA);
- if (!PageUptodate(page))
- SetPageUptodate(page);
- if (PageSwapCache(page))
- return __set_page_dirty_nobuffers(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ BUG_ON(folio_test_swapcache(folio));
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
- if (!page_private_atomic(page)) {
- f2fs_register_inmem_page(inode, page);
- return 1;
+ if (!page_private_atomic(&folio->page)) {
+ f2fs_register_inmem_page(inode, &folio->page);
+ return true;
}
/*
* Previously, this page has been registered, we just
* return here.
*/
- return 0;
+ return false;
}
- if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
- f2fs_update_dirty_page(inode, page);
- return 1;
+ if (!folio_test_dirty(folio)) {
+ filemap_dirty_folio(mapping, folio);
+ f2fs_update_dirty_folio(inode, folio);
+ return true;
}
- return 0;
+ return false;
}
@@ -3929,19 +3723,20 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
unsigned int end_sec = secidx + blkcnt / blk_per_sec;
int ret = 0;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
set_inode_flag(inode, FI_ALIGNED_WRITE);
+ set_inode_flag(inode, FI_OPU_WRITE);
for (; secidx < end_sec; secidx++) {
- down_write(&sbi->pin_sem);
+ f2fs_down_write(&sbi->pin_sem);
f2fs_lock_op(sbi);
f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
f2fs_unlock_op(sbi);
- set_inode_flag(inode, FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_SKIP_WRITES);
for (blkofs = 0; blkofs < blk_per_sec; blkofs++) {
struct page *page;
@@ -3949,7 +3744,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
page = f2fs_get_lock_data_page(inode, blkidx, true);
if (IS_ERR(page)) {
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
ret = PTR_ERR(page);
goto done;
}
@@ -3958,22 +3753,23 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
f2fs_put_page(page, 1);
}
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
ret = filemap_fdatawrite(inode->i_mapping);
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
if (ret)
break;
}
done:
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
+ clear_inode_flag(inode, FI_OPU_WRITE);
clear_inode_flag(inode, FI_ALIGNED_WRITE);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -4146,10 +3942,10 @@ const struct address_space_operations f2fs_dblock_aops = {
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
.write_end = f2fs_write_end,
- .set_page_dirty = f2fs_set_data_page_dirty,
- .invalidatepage = f2fs_invalidate_page,
+ .dirty_folio = f2fs_dirty_data_folio,
+ .invalidate_folio = f2fs_invalidate_folio,
.releasepage = f2fs_release_page,
- .direct_IO = f2fs_direct_IO,
+ .direct_IO = noop_direct_IO,
.bmap = f2fs_bmap,
.swap_activate = f2fs_swap_activate,
.swap_deactivate = f2fs_swap_deactivate,
@@ -4229,3 +4025,65 @@ void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
}
+
+static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct f2fs_map_blocks map = {};
+ pgoff_t next_pgofs = 0;
+ int err;
+
+ map.m_lblk = bytes_to_blks(inode, offset);
+ map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1;
+ map.m_next_pgofs = &next_pgofs;
+ map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint);
+ if (flags & IOMAP_WRITE)
+ map.m_may_create = true;
+
+ err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE,
+ F2FS_GET_BLOCK_DIO);
+ if (err)
+ return err;
+
+ iomap->offset = blks_to_bytes(inode, map.m_lblk);
+
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
+ if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) {
+ iomap->length = blks_to_bytes(inode, map.m_len);
+ if (map.m_flags & F2FS_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags |= IOMAP_F_MERGED;
+ } else {
+ iomap->type = IOMAP_UNWRITTEN;
+ }
+ if (WARN_ON_ONCE(!__is_valid_data_blkaddr(map.m_pblk)))
+ return -EINVAL;
+
+ iomap->bdev = map.m_bdev;
+ iomap->addr = blks_to_bytes(inode, map.m_pblk);
+ } else {
+ iomap->length = blks_to_bytes(inode, next_pgofs) -
+ iomap->offset;
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ }
+
+ if (map.m_flags & F2FS_MAP_NEW)
+ iomap->flags |= IOMAP_F_NEW;
+ if ((inode->i_state & I_DIRTY_DATASYNC) ||
+ offset + length > i_size_read(inode))
+ iomap->flags |= IOMAP_F_DIRTY;
+
+ return 0;
+}
+
+const struct iomap_ops f2fs_iomap_ops = {
+ .iomap_begin = f2fs_iomap_begin,
+};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8c50518475a9..fcdf253cd211 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -21,7 +21,7 @@
#include "gc.h"
static LIST_HEAD(f2fs_stat_list);
-static DEFINE_MUTEX(f2fs_stat_mutex);
+static DEFINE_RAW_SPINLOCK(f2fs_stat_lock);
#ifdef CONFIG_DEBUG_FS
static struct dentry *f2fs_debugfs_root;
#endif
@@ -338,14 +338,16 @@ static char *s_flag[] = {
[SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush",
[SBI_QUOTA_NEED_REPAIR] = " quota_need_repair",
[SBI_IS_RESIZEFS] = " resizefs",
+ [SBI_IS_FREEZING] = " freezefs",
};
static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
int i = 0, j = 0;
+ unsigned long flags;
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
update_general_status(si->sbi);
@@ -474,12 +476,14 @@ static int stat_show(struct seq_file *s, void *v)
si->node_segs, si->bg_node_segs);
seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
"Idle Greedy (%d), Idle AT (%d), "
- "Urgent High (%d), Urgent Low (%d)\n",
+ "Urgent High (%d), Urgent Mid (%d), "
+ "Urgent Low (%d)\n",
si->sbi->gc_reclaimed_segs[GC_NORMAL],
si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
+ si->sbi->gc_reclaimed_segs[GC_URGENT_MID],
si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
@@ -532,6 +536,9 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - imeta: %4d\n",
si->ndirty_imeta);
+ seq_printf(s, " - fsync mark: %4lld\n",
+ percpu_counter_sum_positive(
+ &si->sbi->rf_node_block_count));
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
@@ -573,7 +580,7 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@@ -584,6 +591,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
+ unsigned long flags;
int i;
si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
@@ -619,9 +627,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->max_aw_cnt, 0);
atomic_set(&sbi->max_vw_cnt, 0);
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_add_tail(&si->stat_list, &f2fs_stat_list);
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@@ -629,10 +637,11 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
+ unsigned long flags;
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_del(&si->stat_list);
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
kfree(si);
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 1820e9c106f7..a0e51937d92e 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -16,7 +16,7 @@
#include "xattr.h"
#include <trace/events/f2fs.h>
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
extern struct kmem_cache *f2fs_cf_name_slab;
#endif
@@ -79,7 +79,7 @@ unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de)
int f2fs_init_casefolded_name(const struct inode *dir,
struct f2fs_filename *fname)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
struct super_block *sb = dir->i_sb;
if (IS_CASEFOLDED(dir)) {
@@ -174,7 +174,7 @@ void f2fs_free_filename(struct f2fs_filename *fname)
kfree(fname->crypto_buf.name);
fname->crypto_buf.name = NULL;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (fname->cf_name.name) {
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
fname->cf_name.name = NULL;
@@ -208,7 +208,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir,
return f2fs_find_target_dentry(&d, fname, max_slots);
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/*
* Test whether a case-insensitive directory entry matches the filename
* being searched for.
@@ -266,7 +266,7 @@ static inline int f2fs_match_name(const struct inode *dir,
{
struct fscrypt_name f;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (fname->cf_name.name) {
struct qstr cf = FSTR_TO_QSTR(&fname->cf_name);
@@ -766,7 +766,7 @@ add_dentry:
f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
if (inode) {
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -793,7 +793,7 @@ add_dentry:
f2fs_update_parent_metadata(dir, inode, current_depth);
fail:
if (inode)
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
f2fs_put_page(dentry_page, 1);
@@ -858,7 +858,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
struct page *page;
int err = 0;
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -869,7 +869,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
clear_inode_flag(inode, FI_NEW_INODE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
fail:
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return err;
}
@@ -877,7 +877,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
if (S_ISDIR(inode->i_mode))
f2fs_i_links_write(dir, false);
@@ -888,7 +888,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
f2fs_i_links_write(inode, false);
f2fs_i_size_write(inode, 0);
}
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
if (inode->i_nlink == 0)
f2fs_add_orphan_inode(inode);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index b339ae89c1ad..8c570de21ed5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -28,6 +28,8 @@
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
+struct pagevec;
+
#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
#else
@@ -55,6 +57,8 @@ enum {
FAULT_DISCARD,
FAULT_WRITE_IO,
FAULT_SLAB_ALLOC,
+ FAULT_DQUOT_INIT,
+ FAULT_LOCK_OP,
FAULT_MAX,
};
@@ -119,6 +123,20 @@ typedef u32 nid_t;
#define COMPRESS_EXT_NUM 16
+/*
+ * An implementation of an rwsem that is explicitly unfair to readers. This
+ * prevents priority inversion when a low-priority reader acquires the read lock
+ * while sleeping on the write lock but the write lock is needed by
+ * higher-priority clients.
+ */
+
+struct f2fs_rwsem {
+ struct rw_semaphore internal_rwsem;
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wait_queue_head_t read_waiters;
+#endif
+};
+
struct f2fs_mount_info {
unsigned int opt;
int write_io_size_bits; /* Write IO size bits */
@@ -136,7 +154,6 @@ struct f2fs_mount_info {
int s_jquota_fmt; /* Format of quota to use */
#endif
/* For which write hints are passed down to block layer */
- int whint_mode;
int alloc_mode; /* segment allocation policy */
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
@@ -382,6 +399,10 @@ struct discard_cmd_control {
struct mutex cmd_lock;
unsigned int nr_discards; /* # of discards in the list */
unsigned int max_discards; /* max. discards to be issued */
+ unsigned int max_discard_request; /* max. discard request per round */
+ unsigned int min_discard_issue_time; /* min. interval between discard issue */
+ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */
+ unsigned int max_discard_issue_time; /* max. interval between discard issue */
unsigned int discard_granularity; /* discard granularity */
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
@@ -484,7 +505,7 @@ struct f2fs_filename {
*/
struct fscrypt_str crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/*
* For casefolded directories: the casefolded name, but it's left NULL
* if the original name is not valid Unicode, if the directory is both
@@ -557,16 +578,25 @@ enum {
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
+/* maximum retry of EIO'ed meta page */
+#define MAX_RETRY_META_PAGE_EIO 100
+
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
+/* dirty segments threshold for triggering CP */
+#define DEFAULT_DIRTY_THRESHOLD 4
+
/* for in-memory extent cache entry */
#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */
/* number of extent info in extent cache we try to shrink */
#define EXTENT_CACHE_SHRINK_NUMBER 128
+#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
+#define RECOVERY_MIN_RA_BLOCKS 1
+
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
union {
@@ -617,6 +647,7 @@ struct extent_tree {
F2FS_MAP_UNWRITTEN)
struct f2fs_map_blocks {
+ struct block_device *m_bdev; /* for multi-device dio */
block_t m_pblk;
block_t m_lblk;
unsigned int m_len;
@@ -625,6 +656,7 @@ struct f2fs_map_blocks {
pgoff_t *m_next_extent; /* point to next possible extent */
int m_seg_type;
bool m_may_create; /* indicate it is from write path */
+ bool m_multidev_dio; /* indicate it allows multi-device dio */
};
/* for flag in get_data_block */
@@ -648,6 +680,7 @@ enum {
#define FADVISE_KEEP_SIZE_BIT 0x10
#define FADVISE_HOT_BIT 0x20
#define FADVISE_VERITY_BIT 0x40
+#define FADVISE_TRUNC_BIT 0x80
#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT)
@@ -675,6 +708,10 @@ enum {
#define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT)
#define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT)
+#define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT)
+#define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT)
+#define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT)
+
#define DEF_DIR_LEVEL 0
enum {
@@ -707,9 +744,10 @@ enum {
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
FI_INLINE_DOTS, /* indicate inline dot dentries */
- FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_SKIP_WRITES, /* should skip data page writeback */
+ FI_OPU_WRITE, /* used for opu per file */
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
- FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
+ FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */
FI_HOT_DATA, /* indicate file is hot */
FI_EXTRA_ATTR, /* indicate file has extra attribute */
FI_PROJ_INHERIT, /* indicate file inherits projectid */
@@ -738,7 +776,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */
- struct rw_semaphore i_sem; /* protect fi info */
+ struct f2fs_rwsem i_sem; /* protect fi info */
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
@@ -763,8 +801,8 @@ struct f2fs_inode_info {
struct extent_tree *extent_tree; /* cached extent_tree entry */
/* avoid racing between foreground op and gc */
- struct rw_semaphore i_gc_rwsem[2];
- struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */
+ struct f2fs_rwsem i_gc_rwsem[2];
+ struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
int i_extra_isize; /* size of extra space located in i_addr */
kprojid_t i_projid; /* id for project quota */
@@ -883,6 +921,7 @@ struct f2fs_nm_info {
nid_t max_nid; /* maximum possible node ids */
nid_t available_nids; /* # of available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
+ nid_t max_rf_node_blocks; /* max # of nodes for recovery */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */
@@ -890,7 +929,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- struct rw_semaphore nat_tree_lock; /* protect nat entry tree */
+ struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
struct list_head nat_entries; /* cached nat entry list (clean) */
spinlock_t nat_list_lock; /* protect clean nat entry list */
unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
@@ -1003,7 +1042,7 @@ struct f2fs_sm_info {
struct dirty_seglist_info *dirty_info; /* dirty segment information */
struct curseg_info *curseg_array; /* active segment information */
- struct rw_semaphore curseg_lock; /* for preventing curseg change */
+ struct f2fs_rwsem curseg_lock; /* for preventing curseg change */
block_t seg0_blkaddr; /* block address of 0'th segment */
block_t main_blkaddr; /* start block address of main area */
@@ -1012,6 +1051,7 @@ struct f2fs_sm_info {
unsigned int segment_count; /* total # of segments */
unsigned int main_segments; /* # of segments in main area */
unsigned int reserved_segments; /* # of reserved segments */
+ unsigned int additional_reserved_segments;/* reserved segs for IO align feature */
unsigned int ovp_segments; /* # of overprovision segments */
/* a threshold to reclaim prefree segments */
@@ -1186,11 +1226,11 @@ struct f2fs_bio_info {
struct bio *bio; /* bios to merge */
sector_t last_block_in_bio; /* last block number */
struct f2fs_io_info fio; /* store buffered io info. */
- struct rw_semaphore io_rwsem; /* blocking op for bio */
+ struct f2fs_rwsem io_rwsem; /* blocking op for bio */
spinlock_t io_lock; /* serialize DATA/NODE IOs */
struct list_head io_list; /* track fios */
struct list_head bio_list; /* bio entry list head */
- struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */
+ struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */
};
#define FDEV(i) (sbi->devs[i])
@@ -1252,6 +1292,7 @@ enum {
SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
SBI_IS_RESIZEFS, /* resizefs is in process */
+ SBI_IS_FREEZING, /* freezefs is in process */
};
enum {
@@ -1271,6 +1312,7 @@ enum {
GC_IDLE_AT,
GC_URGENT_HIGH,
GC_URGENT_LOW,
+ GC_URGENT_MID,
MAX_GC_MODE,
};
@@ -1284,14 +1326,10 @@ enum {
};
enum {
- FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */
- FS_MODE_LFS, /* use lfs allocation only */
-};
-
-enum {
- WHINT_MODE_OFF, /* not pass down write hints */
- WHINT_MODE_USER, /* try to pass down hints given by users */
- WHINT_MODE_FS, /* pass down hints with F2FS policy */
+ FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */
+ FS_MODE_LFS, /* use lfs allocation only */
+ FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */
+ FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */
};
enum {
@@ -1478,6 +1516,7 @@ struct compress_ctx {
unsigned int nr_rpages; /* total page number in rpages */
struct page **cpages; /* pages store compressed data in cluster */
unsigned int nr_cpages; /* total page number in cpages */
+ unsigned int valid_nr_cpages; /* valid page number in cpages */
void *rbuf; /* virtual mapped address on rpages */
struct compress_data *cbuf; /* virtual mapped address on cpages */
size_t rlen; /* valid data length in rbuf */
@@ -1553,7 +1592,7 @@ struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
struct f2fs_super_block *raw_super; /* raw super block pointer */
- struct rw_semaphore sb_lock; /* lock for raw super block */
+ struct f2fs_rwsem sb_lock; /* lock for raw super block */
int valid_super_block; /* valid super block no */
unsigned long s_flag; /* flags for sbi */
struct mutex writepages; /* mutex for writepages() */
@@ -1573,18 +1612,20 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */
/* keep migration IO order for LFS mode */
- struct rw_semaphore io_order_lock;
+ struct f2fs_rwsem io_order_lock;
mempool_t *write_io_dummy; /* Dummy pages */
+ pgoff_t metapage_eio_ofs; /* EIO page offset */
+ int metapage_eio_cnt; /* EIO count */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
int cur_cp_pack; /* remain current cp pack */
spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
- struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */
- struct rw_semaphore cp_rwsem; /* blocking FS operations */
- struct rw_semaphore node_write; /* locking node writes */
- struct rw_semaphore node_change; /* locking node change */
+ struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */
+ struct f2fs_rwsem cp_rwsem; /* blocking FS operations */
+ struct f2fs_rwsem node_write; /* locking node writes */
+ struct f2fs_rwsem node_change; /* locking node change */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
@@ -1644,12 +1685,14 @@ struct f2fs_sb_info {
block_t unusable_block_count; /* # of blocks saved by last cp */
unsigned int nquota_files; /* # of quota sysfile */
- struct rw_semaphore quota_sem; /* blocking cp for flags */
+ struct f2fs_rwsem quota_sem; /* blocking cp for flags */
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
struct percpu_counter alloc_valid_block_count;
+ /* # of node block writes as roll forward recovery */
+ struct percpu_counter rf_node_block_count;
/* writeback control */
atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */
@@ -1660,7 +1703,7 @@ struct f2fs_sb_info {
struct f2fs_mount_info mount_opt; /* mount options */
/* for cleaning operations */
- struct rw_semaphore gc_lock; /*
+ struct f2fs_rwsem gc_lock; /*
* semaphore for GC, avoid
* race between GC and GC or CP
*/
@@ -1669,6 +1712,9 @@ struct f2fs_sb_info {
unsigned int cur_victim_sec; /* current victim section num */
unsigned int gc_mode; /* current GC state */
unsigned int next_victim_seg[2]; /* next segment in victim section */
+ spinlock_t gc_urgent_high_lock;
+ bool gc_urgent_high_limited; /* indicates having limited trial count */
+ unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */
/* for skip statistic */
unsigned int atomic_files; /* # of opened atomic file */
@@ -1677,7 +1723,7 @@ struct f2fs_sb_info {
/* threshold for gc trials on pinned files */
u64 gc_pin_file_threshold;
- struct rw_semaphore pin_sem;
+ struct f2fs_rwsem pin_sem;
/* maximum # of trials to find a victim segment for SSR and GC */
unsigned int max_victim_search;
@@ -1728,12 +1774,15 @@ struct f2fs_sb_info {
/* For shrinker support */
struct list_head s_list;
+ struct mutex umount_mutex;
+ unsigned int shrinker_run_no;
+
+ /* For multi devices */
int s_ndevs; /* number of devices */
struct f2fs_dev_info *devs; /* for device list */
unsigned int dirty_device; /* for checkpoint data flush */
spinlock_t dev_lock; /* protect dirty_device */
- struct mutex umount_mutex;
- unsigned int shrinker_run_no;
+ bool aligned_blksize; /* all devices has the same logical blksize */
/* For write statistics */
u64 sectors_written_start;
@@ -1756,6 +1805,9 @@ struct f2fs_sb_info {
unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */
+ int max_fragment_chunk; /* max chunk size for block fragmentation mode */
+ int max_fragment_hole; /* max hole size for block fragmentation mode */
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
@@ -1787,13 +1839,6 @@ struct f2fs_sb_info {
#endif
};
-struct f2fs_private_dio {
- struct inode *inode;
- void *orig_private;
- bio_end_io_t *orig_end_io;
- bool write;
-};
-
#ifdef CONFIG_F2FS_FAULT_INJECTION
#define f2fs_show_injection_info(sbi, type) \
printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \
@@ -2072,29 +2117,105 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
+#define init_f2fs_rwsem(sem) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __init_f2fs_rwsem((sem), #sem, &__key); \
+} while (0)
+
+static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
+ const char *sem_name, struct lock_class_key *key)
+{
+ __init_rwsem(&sem->internal_rwsem, sem_name, key);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ init_waitqueue_head(&sem->read_waiters);
+#endif
+}
+
+static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem)
+{
+ return rwsem_is_locked(&sem->internal_rwsem);
+}
+
+static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem)
+{
+ return rwsem_is_contended(&sem->internal_rwsem);
+}
+
+static inline void f2fs_down_read(struct f2fs_rwsem *sem)
+{
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem));
+#else
+ down_read(&sem->internal_rwsem);
+#endif
+}
+
+static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem)
+{
+ return down_read_trylock(&sem->internal_rwsem);
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass)
+{
+ down_read_nested(&sem->internal_rwsem, subclass);
+}
+#else
+#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem)
+#endif
+
+static inline void f2fs_up_read(struct f2fs_rwsem *sem)
+{
+ up_read(&sem->internal_rwsem);
+}
+
+static inline void f2fs_down_write(struct f2fs_rwsem *sem)
+{
+ down_write(&sem->internal_rwsem);
+}
+
+static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem)
+{
+ return down_write_trylock(&sem->internal_rwsem);
+}
+
+static inline void f2fs_up_write(struct f2fs_rwsem *sem)
+{
+ up_write(&sem->internal_rwsem);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wake_up_all(&sem->read_waiters);
+#endif
+}
+
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
- down_read(&sbi->cp_rwsem);
+ f2fs_down_read(&sbi->cp_rwsem);
}
static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
{
- return down_read_trylock(&sbi->cp_rwsem);
+ if (time_to_inject(sbi, FAULT_LOCK_OP)) {
+ f2fs_show_injection_info(sbi, FAULT_LOCK_OP);
+ return 0;
+ }
+ return f2fs_down_read_trylock(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
- up_read(&sbi->cp_rwsem);
+ f2fs_up_read(&sbi->cp_rwsem);
}
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- down_write(&sbi->cp_rwsem);
+ f2fs_down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->cp_rwsem);
+ f2fs_up_write(&sbi->cp_rwsem);
}
static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
@@ -2184,6 +2305,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
if (!__allow_reserved_blocks(sbi, inode, true))
avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
+
+ if (F2FS_IO_ALIGNED(sbi))
+ avail_user_block_count -= sbi->blocks_per_seg *
+ SM_I(sbi)->additional_reserved_segments;
+
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
if (avail_user_block_count > sbi->unusable_block_count)
avail_user_block_count -= sbi->unusable_block_count;
@@ -2430,6 +2556,11 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
if (!__allow_reserved_blocks(sbi, inode, false))
valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
+
+ if (F2FS_IO_ALIGNED(sbi))
+ valid_block_count += sbi->blocks_per_seg *
+ SM_I(sbi)->additional_reserved_segments;
+
user_block_count = sbi->user_block_count;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
user_block_count -= sbi->unusable_block_count;
@@ -2647,6 +2778,9 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
if (is_inflight_io(sbi, type))
return false;
+ if (sbi->gc_mode == GC_URGENT_MID)
+ return true;
+
if (sbi->gc_mode == GC_URGENT_LOW &&
(type == DISCARD_TIME || type == GC_TIME))
return true;
@@ -3102,12 +3236,16 @@ static inline int is_file(struct inode *inode, int type)
static inline void set_file(struct inode *inode, int type)
{
+ if (is_file(inode, type))
+ return;
F2FS_I(inode)->i_advise |= type;
f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void clear_file(struct inode *inode, int type)
{
+ if (!is_file(inode, type))
+ return;
F2FS_I(inode)->i_advise &= ~type;
f2fs_mark_inode_dirty_sync(inode, true);
}
@@ -3363,6 +3501,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
*/
int f2fs_inode_dirtied(struct inode *inode, bool sync);
void f2fs_inode_synced(struct inode *inode);
+int f2fs_dquot_initialize(struct inode *inode);
int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
int f2fs_quota_sync(struct super_block *sb, int type);
loff_t max_file_blocks(struct inode *inode);
@@ -3391,7 +3530,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
- struct node_info *ni);
+ struct node_info *ni, bool checkpoint_context);
pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
@@ -3492,6 +3631,8 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio);
+void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
+ block_t blkaddr, unsigned int blkcnt);
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type, bool ordered, bool locked);
void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
@@ -3509,13 +3650,21 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_segment_manager_caches(void);
void f2fs_destroy_segment_manager_caches(void);
int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
-enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
- enum page_type type, enum temp_type temp);
unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
unsigned int segno);
unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
unsigned int segno);
+#define DEF_FRAGMENT_SIZE 4
+#define MIN_FRAGMENT_SIZE 1
+#define MAX_FRAGMENT_SIZE 512
+
+static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi)
+{
+ return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG ||
+ F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK;
+}
+
/*
* checkpoint.c
*/
@@ -3528,7 +3677,8 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync);
-void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index);
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
+ unsigned int ra_blocks);
long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write, enum iostat_type io_type);
void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
@@ -3546,7 +3696,7 @@ void f2fs_add_orphan_inode(struct inode *inode);
void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino);
int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi);
int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
-void f2fs_update_dirty_page(struct inode *inode, struct page *page);
+void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio);
void f2fs_remove_dirty_inode(struct inode *inode);
int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
@@ -3580,14 +3730,13 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio);
int f2fs_merge_page_bio(struct f2fs_io_info *fio);
void f2fs_submit_page_write(struct f2fs_io_info *fio);
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
- block_t blk_addr, struct bio *bio);
+ block_t blk_addr, sector_t *sector);
int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr);
void f2fs_set_data_blkaddr(struct dnode_of_data *dn);
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count);
int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
-int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
int op_flags, bool for_write);
@@ -3610,8 +3759,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
struct writeback_control *wbc,
enum iostat_type io_type,
int compr_blocks, bool allow_balance);
-void f2fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length);
+void f2fs_write_failed(struct inode *inode, loff_t to);
+void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
int f2fs_release_page(struct page *page, gfp_t wait);
#ifdef CONFIG_MIGRATION
int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
@@ -3623,6 +3772,7 @@ int f2fs_init_post_read_processing(void);
void f2fs_destroy_post_read_processing(void);
int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi);
void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi);
+extern const struct iomap_ops f2fs_iomap_ops;
/*
* gc.c
@@ -4027,6 +4177,8 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed,
block_t blkaddr);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
+bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
+ int index, int nr_pages);
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn);
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
int f2fs_write_multi_pages(struct compress_ctx *cc,
@@ -4152,8 +4304,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
if (!f2fs_compressed_file(inode))
return true;
- if (S_ISREG(inode->i_mode) &&
- (get_dirty_pages(inode) || atomic_read(&fi->i_compr_blocks)))
+ if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
return false;
fi->i_flags &= ~F2FS_COMPR_FL;
@@ -4302,15 +4453,31 @@ static inline int block_unaligned_IO(struct inode *inode,
return align & blocksize_mask;
}
+static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
+ int flag)
+{
+ if (!f2fs_is_multi_device(sbi))
+ return false;
+ if (flag != F2FS_GET_BLOCK_DIO)
+ return false;
+ return sbi->aligned_blksize;
+}
+
static inline bool f2fs_force_buffered_io(struct inode *inode,
struct kiocb *iocb, struct iov_iter *iter)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int rw = iov_iter_rw(iter);
- if (f2fs_post_read_required(inode))
+ if (!fscrypt_dio_supported(iocb, iter))
+ return true;
+ if (fsverity_active(inode))
return true;
- if (f2fs_is_multi_device(sbi))
+ if (f2fs_compressed_file(inode))
+ return true;
+
+ /* disallow direct IO if any of devices has unaligned blksize */
+ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
return true;
/*
* for blkzoned device, fallback direct IO to buffered IO, so
@@ -4361,6 +4528,12 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
}
+static inline void f2fs_io_schedule_timeout(long timeout)
+{
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(timeout);
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 9c8ef33bd8d3..5b89af0f27f0 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -24,6 +24,7 @@
#include <linux/sched/signal.h>
#include <linux/fileattr.h>
#include <linux/fadvise.h>
+#include <linux/iomap.h>
#include "f2fs.h"
#include "node.h"
@@ -236,13 +237,13 @@ static void try_to_fix_pino(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
nid_t pino;
- down_write(&fi->i_sem);
+ f2fs_down_write(&fi->i_sem);
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
get_parent_ino(inode, &pino)) {
f2fs_i_pino_write(inode, pino);
file_got_pino(inode);
}
- up_write(&fi->i_sem);
+ f2fs_up_write(&fi->i_sem);
}
static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
@@ -317,9 +318,9 @@ go_write:
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
*/
- down_read(&F2FS_I(inode)->i_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_sem);
cp_reason = need_do_checkpoint(inode);
- up_read(&F2FS_I(inode)->i_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_sem);
if (cp_reason) {
/* all the dirty node pages should be flushed for POR */
@@ -786,7 +787,7 @@ int f2fs_truncate(struct inode *inode)
return -EIO;
}
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
@@ -811,7 +812,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_inode *ri;
+ struct f2fs_inode *ri = NULL;
unsigned int flags;
if (f2fs_has_extra_attr(inode) &&
@@ -843,7 +844,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
/* we need to show initial sectors used for inline_data/dentries */
if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
@@ -903,7 +904,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
@@ -916,7 +917,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
if (is_quota_modification(inode, attr)) {
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
}
@@ -957,7 +958,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_setsize(inode, attr->ia_size);
@@ -969,7 +970,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* larger than i_size.
*/
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -979,10 +980,10 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
- __setattr_copy(&init_user_ns, inode, attr);
+ __setattr_copy(mnt_userns, inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode));
+ err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode));
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
if (!err)
@@ -1111,7 +1112,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
blk_start = (loff_t)pg_start << PAGE_SHIFT;
blk_end = (loff_t)pg_end << PAGE_SHIFT;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache_range(inode, blk_start, blk_end - 1);
@@ -1121,7 +1122,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -1232,7 +1233,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
if (ret)
return ret;
- ret = f2fs_get_node_info(sbi, dn.nid, &ni);
+ ret = f2fs_get_node_info(sbi, dn.nid, &ni, false);
if (ret) {
f2fs_put_dnode(&dn);
return ret;
@@ -1354,7 +1355,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
/* avoid gc operation during block exchange */
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
f2fs_lock_op(sbi);
@@ -1364,7 +1365,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -1499,7 +1500,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
unsigned int end_offset;
pgoff_t end;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
truncate_pagecache_range(inode,
@@ -1513,7 +1514,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret) {
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -1525,7 +1526,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_balance_fs(sbi, dn.node_changed);
@@ -1599,7 +1600,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
/* avoid gc operation during block exchange */
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
truncate_pagecache(inode, offset);
@@ -1617,7 +1618,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
}
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/* write out all moved pages, if possible */
filemap_invalidate_lock(mapping);
@@ -1673,13 +1674,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
next_alloc:
if (has_not_enough_free_secs(sbi, 0,
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err && err != -ENODATA && err != -EAGAIN)
goto out_err;
}
- down_write(&sbi->pin_sem);
+ f2fs_down_write(&sbi->pin_sem);
f2fs_lock_op(sbi);
f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
@@ -1687,8 +1688,9 @@ next_alloc:
map.m_seg_type = CURSEG_COLD_DATA_PINNED;
err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+ file_dont_truncate(inode);
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
expanded += map.m_len;
sec_len -= map.m_len;
@@ -1748,7 +1750,11 @@ static long f2fs_fallocate(struct file *file, int mode,
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
- if (f2fs_compressed_file(inode) &&
+ /*
+ * Pinned file should not support partial truncation since the block
+ * can be used by applications.
+ */
+ if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) &&
(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
@@ -1983,11 +1989,12 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
static int f2fs_ioc_start_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
@@ -2002,7 +2009,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode_lock(inode);
- f2fs_disable_compressed_file(inode);
+ if (!f2fs_disable_compressed_file(inode)) {
+ ret = -EINVAL;
+ goto out;
+ }
if (f2fs_is_atomic_file(inode)) {
if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
@@ -2014,7 +2024,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (ret)
goto out;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/*
* Should wait end_io to count F2FS_WB_CP_DATA correctly by
@@ -2025,7 +2035,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret) {
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -2038,7 +2048,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
/* add inode in inmem_list first and set atomic_file */
set_inode_flag(inode, FI_ATOMIC_FILE);
clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
F2FS_I(inode)->inmem_task = current;
@@ -2052,9 +2062,10 @@ out:
static int f2fs_ioc_commit_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2094,9 +2105,10 @@ err_out:
static int f2fs_ioc_start_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
@@ -2129,9 +2141,10 @@ out:
static int f2fs_ioc_release_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2158,9 +2171,10 @@ out:
static int f2fs_ioc_abort_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2345,7 +2359,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
if (err)
return err;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
goto got_it;
@@ -2364,7 +2378,7 @@ got_it:
16))
err = -EFAULT;
out_err:
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
mnt_drop_write_file(filp);
return err;
}
@@ -2441,12 +2455,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
return ret;
if (!sync) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO);
@@ -2477,12 +2491,12 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range)
do_more:
if (!range->sync) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, range->sync, true, false,
@@ -2553,10 +2567,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
bool fragmented = false;
int err;
- /* if in-place-update policy is enabled, don't waste time here */
- if (f2fs_should_update_inplace(inode, NULL))
- return -EINVAL;
-
pg_start = range->start >> PAGE_SHIFT;
pg_end = (range->start + range->len) >> PAGE_SHIFT;
@@ -2564,6 +2574,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
inode_lock(inode);
+ /* if in-place-update policy is enabled, don't waste time here */
+ set_inode_flag(inode, FI_OPU_WRITE);
+ if (f2fs_should_update_inplace(inode, NULL)) {
+ err = -EINVAL;
+ goto out;
+ }
+
/* writeback all dirty pages in the range */
err = filemap_write_and_wait_range(inode->i_mapping, range->start,
range->start + range->len - 1);
@@ -2645,7 +2662,7 @@ do_map:
goto check;
}
- set_inode_flag(inode, FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_SKIP_WRITES);
idx = map.m_lblk;
while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
@@ -2670,15 +2687,16 @@ check:
if (map.m_lblk < pg_end && cnt < blk_per_seg)
goto do_map;
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
err = filemap_fdatawrite(inode->i_mapping);
if (err)
goto out;
}
clear_out:
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
out:
+ clear_inode_flag(inode, FI_OPU_WRITE);
inode_unlock(inode);
if (!err)
range->len = (u64)total << PAGE_SHIFT;
@@ -2814,10 +2832,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_balance_fs(sbi, true);
- down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
if (src != dst) {
ret = -EBUSY;
- if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
+ if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
goto out_src;
}
@@ -2835,9 +2853,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_unlock_op(sbi);
if (src != dst)
- up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
out_src:
- up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
out_unlock:
if (src != dst)
inode_unlock(dst);
@@ -2932,7 +2950,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
end_segno = min(start_segno + range.segments, dev_end_segno);
while (start_segno < end_segno) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
@@ -2984,7 +3002,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *ipage;
+ struct f2fs_inode *ri = NULL;
kprojid_t kprojid;
int err;
@@ -3008,19 +3026,10 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
if (IS_NOQUOTA(inode))
return err;
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
+ if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid))
+ return -EOVERFLOW;
- if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize,
- i_projid)) {
- err = -EOVERFLOW;
- f2fs_put_page(ipage, 1);
- return err;
- }
- f2fs_put_page(ipage, 1);
-
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
@@ -3143,17 +3152,17 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
inode_lock(inode);
- if (f2fs_should_update_outplace(inode, NULL)) {
- ret = -EINVAL;
- goto out;
- }
-
if (!pin) {
clear_inode_flag(inode, FI_PIN_FILE);
f2fs_i_gc_failures_write(inode, 0);
goto done;
}
+ if (f2fs_should_update_outplace(inode, NULL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (f2fs_pin_file_control(inode, false)) {
ret = -EAGAIN;
goto out;
@@ -3209,9 +3218,9 @@ int f2fs_precache_extents(struct inode *inode)
while (map.m_lblk < end) {
map.m_len = end - map.m_lblk;
- down_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE);
- up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -3288,11 +3297,11 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
if (!vbuf)
return -ENOMEM;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
count = utf16s_to_utf8s(sbi->raw_super->volume_name,
ARRAY_SIZE(sbi->raw_super->volume_name),
UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME);
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
if (copy_to_user((char __user *)arg, vbuf,
min(FSLABEL_MAX, count)))
@@ -3320,7 +3329,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg)
if (err)
goto out;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
memset(sbi->raw_super->volume_name, 0,
sizeof(sbi->raw_super->volume_name));
@@ -3330,7 +3339,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg)
err = f2fs_commit_super(sbi, false);
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
mnt_drop_write_file(filp);
out:
@@ -3456,7 +3465,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
if (!atomic_read(&F2FS_I(inode)->i_compr_blocks))
goto out;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3493,7 +3502,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
}
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
out:
inode_unlock(inode);
@@ -3609,7 +3618,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
goto unlock_inode;
}
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3646,7 +3655,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
}
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (ret >= 0) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
@@ -3764,7 +3773,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
if (ret)
goto err;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
ret = filemap_write_and_wait_range(mapping, range.start,
@@ -3853,7 +3862,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
prev_block, len, range.flags);
out:
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
err:
inode_unlock(inode);
file_end_write(filp);
@@ -4218,133 +4227,436 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return __f2fs_ioctl(filp, cmd, arg);
}
-static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+/*
+ * Return %true if the given read or write request should use direct I/O, or
+ * %false if it should use buffered I/O.
+ */
+static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ unsigned int align;
+
+ if (!(iocb->ki_flags & IOCB_DIRECT))
+ return false;
+
+ if (f2fs_force_buffered_io(inode, iocb, iter))
+ return false;
+
+ /*
+ * Direct I/O not aligned to the disk's logical_block_size will be
+ * attempted, but will fail with -EINVAL.
+ *
+ * f2fs additionally requires that direct I/O be aligned to the
+ * filesystem block size, which is often a stricter requirement.
+ * However, f2fs traditionally falls back to buffered I/O on requests
+ * that are logical_block_size-aligned but not fs-block aligned.
+ *
+ * The below logic implements this behavior.
+ */
+ align = iocb->ki_pos | iov_iter_alignment(iter);
+ if (!IS_ALIGNED(align, i_blocksize(inode)) &&
+ IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev)))
+ return false;
+
+ return true;
+}
+
+static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error,
+ unsigned int flags)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp));
+
+ dec_page_count(sbi, F2FS_DIO_READ);
+ if (error)
+ return error;
+ f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size);
+ return 0;
+}
+
+static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = {
+ .end_io = f2fs_dio_read_end_io,
+};
+
+static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- int ret;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ const loff_t pos = iocb->ki_pos;
+ const size_t count = iov_iter_count(to);
+ struct iomap_dio *dio;
+ ssize_t ret;
+
+ if (count == 0)
+ return 0; /* skip atime update */
+
+ trace_f2fs_direct_IO_enter(inode, iocb, count, READ);
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ } else {
+ f2fs_down_read(&fi->i_gc_rwsem[READ]);
+ }
+
+ /*
+ * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of
+ * the higher-level function iomap_dio_rw() in order to ensure that the
+ * F2FS_DIO_READ counter will be decremented correctly in all cases.
+ */
+ inc_page_count(sbi, F2FS_DIO_READ);
+ dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops,
+ &f2fs_iomap_dio_read_ops, 0, 0);
+ if (IS_ERR_OR_NULL(dio)) {
+ ret = PTR_ERR_OR_ZERO(dio);
+ if (ret != -EIOCBQUEUED)
+ dec_page_count(sbi, F2FS_DIO_READ);
+ } else {
+ ret = iomap_dio_complete(dio);
+ }
+
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
+
+ file_accessed(file);
+out:
+ trace_f2fs_direct_IO_exit(inode, pos, count, READ, ret);
+ return ret;
+}
+
+static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- ret = generic_file_read_iter(iocb, iter);
+ if (f2fs_should_use_dio(inode, iocb, to))
+ return f2fs_dio_read_iter(iocb, to);
+ ret = filemap_read(iocb, to, 0);
if (ret > 0)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_READ_IO, ret);
-
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret);
return ret;
}
-static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- ssize_t ret;
+ ssize_t count;
+ int err;
- if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
- ret = -EIO;
- goto out;
+ if (IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
+ return -EPERM;
+
+ count = generic_write_checks(iocb, from);
+ if (count <= 0)
+ return count;
+
+ err = file_modified(file);
+ if (err)
+ return err;
+ return count;
+}
+
+/*
+ * Preallocate blocks for a write request, if it is possible and helpful to do
+ * so. Returns a positive number if blocks may have been preallocated, 0 if no
+ * blocks were preallocated, or a negative errno value if something went
+ * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the
+ * requested blocks (not just some of them) have been allocated.
+ */
+static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
+ bool dio)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ const loff_t pos = iocb->ki_pos;
+ const size_t count = iov_iter_count(iter);
+ struct f2fs_map_blocks map = {};
+ int flag;
+ int ret;
+
+ /* If it will be an out-of-place direct write, don't bother. */
+ if (dio && f2fs_lfs_mode(sbi))
+ return 0;
+ /*
+ * Don't preallocate blocks for holes, consistent with DIO_SKIP_HOLES: a
+ * direct write that meets any hole turns into buffered IO.
+ */
+ if (dio && i_size_read(inode) &&
+ (F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode))))
+ return 0;
+
+ /* No-wait I/O can't allocate blocks. */
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return 0;
+
+ /* If it will be a short write, don't bother. */
+ if (fault_in_iov_iter_readable(iter, count))
+ return 0;
+
+ if (f2fs_has_inline_data(inode)) {
+ /* If the data will fit inline, don't bother. */
+ if (pos + count <= MAX_INLINE_DATA(inode))
+ return 0;
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
}
- if (!f2fs_is_compress_backend_ready(inode)) {
- ret = -EOPNOTSUPP;
- goto out;
+ /* Do not preallocate blocks that will be written partially in 4KB. */
+ map.m_lblk = F2FS_BLK_ALIGN(pos);
+ map.m_len = F2FS_BYTES_TO_BLK(pos + count);
+ if (map.m_len > map.m_lblk)
+ map.m_len -= map.m_lblk;
+ else
+ map.m_len = 0;
+ map.m_may_create = true;
+ if (dio) {
+ map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint);
+ flag = F2FS_GET_BLOCK_PRE_DIO;
+ } else {
+ map.m_seg_type = NO_CHECK_TYPE;
+ flag = F2FS_GET_BLOCK_PRE_AIO;
}
+ ret = f2fs_map_blocks(inode, &map, 1, flag);
+ /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */
+ if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0))
+ return ret;
+ if (ret == 0)
+ set_inode_flag(inode, FI_PREALLOCATED_ALL);
+ return map.m_len;
+}
+
+static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EOPNOTSUPP;
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = generic_perform_write(iocb, from);
+ current->backing_dev_info = NULL;
+
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret);
+ }
+ return ret;
+}
+
+static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
+ unsigned int flags)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp));
+
+ dec_page_count(sbi, F2FS_DIO_WRITE);
+ if (error)
+ return error;
+ f2fs_update_iostat(sbi, APP_DIRECT_IO, size);
+ return 0;
+}
+
+static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = {
+ .end_io = f2fs_dio_write_end_io,
+};
+
+static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ bool *may_need_sync)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ const bool do_opu = f2fs_lfs_mode(sbi);
+ const loff_t pos = iocb->ki_pos;
+ const ssize_t count = iov_iter_count(from);
+ unsigned int dio_flags;
+ struct iomap_dio *dio;
+ ssize_t ret;
+
+ trace_f2fs_direct_IO_enter(inode, iocb, count, WRITE);
+
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock(inode)) {
+ /* f2fs_convert_inline_inode() and block allocation can block */
+ if (f2fs_has_inline_data(inode) ||
+ !f2fs_overwrite_io(inode, pos, count)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[WRITE])) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
ret = -EAGAIN;
goto out;
}
} else {
- inode_lock(inode);
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ goto out;
- if (unlikely(IS_IMMUTABLE(inode))) {
- ret = -EPERM;
- goto unlock;
+ f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
+ if (do_opu)
+ f2fs_down_read(&fi->i_gc_rwsem[READ]);
}
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
- ret = -EPERM;
- goto unlock;
+ /*
+ * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of
+ * the higher-level function iomap_dio_rw() in order to ensure that the
+ * F2FS_DIO_WRITE counter will be decremented correctly in all cases.
+ */
+ inc_page_count(sbi, F2FS_DIO_WRITE);
+ dio_flags = 0;
+ if (pos + count > inode->i_size)
+ dio_flags |= IOMAP_DIO_FORCE_WAIT;
+ dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops,
+ &f2fs_iomap_dio_write_ops, dio_flags, 0);
+ if (IS_ERR_OR_NULL(dio)) {
+ ret = PTR_ERR_OR_ZERO(dio);
+ if (ret == -ENOTBLK)
+ ret = 0;
+ if (ret != -EIOCBQUEUED)
+ dec_page_count(sbi, F2FS_DIO_WRITE);
+ } else {
+ ret = iomap_dio_complete(dio);
}
- ret = generic_write_checks(iocb, from);
- if (ret > 0) {
- bool preallocated = false;
- size_t target_size = 0;
- int err;
-
- if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
- set_inode_flag(inode, FI_NO_PREALLOC);
-
- if ((iocb->ki_flags & IOCB_NOWAIT)) {
- if (!f2fs_overwrite_io(inode, iocb->ki_pos,
- iov_iter_count(from)) ||
- f2fs_has_inline_data(inode) ||
- f2fs_force_buffered_io(inode, iocb, from)) {
- clear_inode_flag(inode, FI_NO_PREALLOC);
- inode_unlock(inode);
- ret = -EAGAIN;
+ if (do_opu)
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
+ f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
+
+ if (ret < 0)
+ goto out;
+ if (pos + ret > inode->i_size)
+ f2fs_i_size_write(inode, pos + ret);
+ if (!do_opu)
+ set_inode_flag(inode, FI_UPDATE_WRITE);
+
+ if (iov_iter_count(from)) {
+ ssize_t ret2;
+ loff_t bufio_start_pos = iocb->ki_pos;
+
+ /*
+ * The direct write was partial, so we need to fall back to a
+ * buffered write for the remainder.
+ */
+
+ ret2 = f2fs_buffered_write_iter(iocb, from);
+ if (iov_iter_count(from))
+ f2fs_write_failed(inode, iocb->ki_pos);
+ if (ret2 < 0)
+ goto out;
+
+ /*
+ * Ensure that the pagecache pages are written to disk and
+ * invalidated to preserve the expected O_DIRECT semantics.
+ */
+ if (ret2 > 0) {
+ loff_t bufio_end_pos = bufio_start_pos + ret2 - 1;
+
+ ret += ret2;
+
+ ret2 = filemap_write_and_wait_range(file->f_mapping,
+ bufio_start_pos,
+ bufio_end_pos);
+ if (ret2 < 0)
goto out;
- }
- goto write;
+ invalidate_mapping_pages(file->f_mapping,
+ bufio_start_pos >> PAGE_SHIFT,
+ bufio_end_pos >> PAGE_SHIFT);
}
+ } else {
+ /* iomap_dio_complete() already handled the generic_write_sync(). */
+ *may_need_sync = false;
+ }
+out:
+ trace_f2fs_direct_IO_exit(inode, pos, count, WRITE, ret);
+ return ret;
+}
- if (is_inode_flag_set(inode, FI_NO_PREALLOC))
- goto write;
+static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ const loff_t orig_pos = iocb->ki_pos;
+ const size_t orig_count = iov_iter_count(from);
+ loff_t target_size;
+ bool dio;
+ bool may_need_sync = true;
+ int preallocated;
+ ssize_t ret;
- if (iocb->ki_flags & IOCB_DIRECT) {
- /*
- * Convert inline data for Direct I/O before entering
- * f2fs_direct_IO().
- */
- err = f2fs_convert_inline_inode(inode);
- if (err)
- goto out_err;
- /*
- * If force_buffere_io() is true, we have to allocate
- * blocks all the time, since f2fs_direct_IO will fall
- * back to buffered IO.
- */
- if (!f2fs_force_buffered_io(inode, iocb, from) &&
- f2fs_lfs_mode(F2FS_I_SB(inode)))
- goto write;
- }
- preallocated = true;
- target_size = iocb->ki_pos + iov_iter_count(from);
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
+ ret = -EIO;
+ goto out;
+ }
- err = f2fs_preallocate_blocks(iocb, from);
- if (err) {
-out_err:
- clear_inode_flag(inode, FI_NO_PREALLOC);
- inode_unlock(inode);
- ret = err;
+ if (!f2fs_is_compress_backend_ready(inode)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode)) {
+ ret = -EAGAIN;
goto out;
}
-write:
- ret = __generic_file_write_iter(iocb, from);
- clear_inode_flag(inode, FI_NO_PREALLOC);
+ } else {
+ inode_lock(inode);
+ }
- /* if we couldn't write data, we should deallocate blocks. */
- if (preallocated && i_size_read(inode) < target_size) {
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- filemap_invalidate_lock(inode->i_mapping);
- f2fs_truncate(inode);
- filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- }
+ ret = f2fs_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out_unlock;
- if (ret > 0)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
+ /* Determine whether we will do a direct write or a buffered write. */
+ dio = f2fs_should_use_dio(inode, iocb, from);
+
+ /* Possibly preallocate the blocks for the write. */
+ target_size = iocb->ki_pos + iov_iter_count(from);
+ preallocated = f2fs_preallocate_blocks(iocb, from, dio);
+ if (preallocated < 0)
+ ret = preallocated;
+ else
+ /* Do the actual write. */
+ ret = dio ?
+ f2fs_dio_write_iter(iocb, from, &may_need_sync):
+ f2fs_buffered_write_iter(iocb, from);
+
+ /* Don't leave any preallocated blocks around past i_size. */
+ if (preallocated && i_size_read(inode) < target_size) {
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ filemap_invalidate_lock(inode->i_mapping);
+ if (!f2fs_truncate(inode))
+ file_dont_truncate(inode);
+ filemap_invalidate_unlock(inode->i_mapping);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ } else {
+ file_dont_truncate(inode);
}
-unlock:
+
+ clear_inode_flag(inode, FI_PREALLOCATED_ALL);
+out_unlock:
inode_unlock(inode);
out:
- trace_f2fs_file_write_iter(inode, iocb->ki_pos,
- iov_iter_count(from), ret);
- if (ret > 0)
+ trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret);
+ if (ret > 0 && may_need_sync)
ret = generic_write_sync(iocb, ret);
return ret;
}
@@ -4352,12 +4664,12 @@ out:
static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
int advice)
{
- struct inode *inode;
struct address_space *mapping;
struct backing_dev_info *bdi;
+ struct inode *inode = file_inode(filp);
+ int err;
if (advice == POSIX_FADV_SEQUENTIAL) {
- inode = file_inode(filp);
if (S_ISFIFO(inode->i_mode))
return -ESPIPE;
@@ -4374,7 +4686,13 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
return 0;
}
- return generic_fadvise(filp, offset, len, advice);
+ err = generic_fadvise(filp, offset, len, advice);
+ if (!err && advice == POSIX_FADV_DONTNEED &&
+ test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) &&
+ f2fs_compressed_file(inode))
+ f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino);
+
+ return err;
}
#ifdef CONFIG_COMPAT
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 77391e3b7d68..ea5b93b689cd 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -7,13 +7,14 @@
*/
#include <linux/fs.h>
#include <linux/module.h>
-#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
+#include <linux/random.h>
+#include <linux/sched/mm.h>
#include "f2fs.h"
#include "node.h"
@@ -91,22 +92,37 @@ static int gc_thread_func(void *data)
* So, I'd like to wait some time to collect dirty segments.
*/
if (sbi->gc_mode == GC_URGENT_HIGH) {
+ spin_lock(&sbi->gc_urgent_high_lock);
+ if (sbi->gc_urgent_high_limited) {
+ if (!sbi->gc_urgent_high_remaining) {
+ sbi->gc_urgent_high_limited = false;
+ spin_unlock(&sbi->gc_urgent_high_lock);
+ sbi->gc_mode = GC_NORMAL;
+ continue;
+ }
+ sbi->gc_urgent_high_remaining--;
+ }
+ spin_unlock(&sbi->gc_urgent_high_lock);
+ }
+
+ if (sbi->gc_mode == GC_URGENT_HIGH ||
+ sbi->gc_mode == GC_URGENT_MID) {
wait_ms = gc_th->urgent_sleep_time;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
goto do_gc;
}
if (foreground) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
goto do_gc;
- } else if (!down_write_trylock(&sbi->gc_lock)) {
+ } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
stat_other_skip_bggc_count(sbi);
goto next;
}
if (!is_idle(sbi, GC_TIME)) {
increase_sleep_time(gc_th, &wait_ms);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
stat_io_skip_bggc_count(sbi);
goto next;
}
@@ -257,7 +273,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->max_search = sbi->max_victim_search;
/* let's select beginning hot/small space first in no_heap mode*/
- if (test_opt(sbi, NOHEAP) &&
+ if (f2fs_need_rand_seg(sbi))
+ p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec);
+ else if (test_opt(sbi, NOHEAP) &&
(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
p->offset = 0;
else
@@ -944,7 +962,7 @@ next_step:
continue;
}
- if (f2fs_get_node_info(sbi, nid, &ni)) {
+ if (f2fs_get_node_info(sbi, nid, &ni, false)) {
f2fs_put_page(node_page, 1);
continue;
}
@@ -1012,7 +1030,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (IS_ERR(node_page))
return false;
- if (f2fs_get_node_info(sbi, nid, dni)) {
+ if (f2fs_get_node_info(sbi, nid, dni, false)) {
f2fs_put_page(node_page, 1);
return false;
}
@@ -1023,6 +1041,11 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
+ if (f2fs_check_nid_range(sbi, dni->ino)) {
+ f2fs_put_page(node_page, 1);
+ return false;
+ }
+
*nofs = ofs_of_node(node_page);
source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
@@ -1036,7 +1059,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) {
f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u",
blkaddr, source_blkaddr, segno);
- f2fs_bug_on(sbi, 1);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
}
}
#endif
@@ -1203,7 +1226,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
- err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
+ err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false);
if (err)
goto put_out;
@@ -1212,7 +1235,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
if (lfs_mode)
- down_write(&fio.sbi->io_order_lock);
+ f2fs_down_write(&fio.sbi->io_order_lock);
mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
fio.old_blkaddr, false);
@@ -1298,7 +1321,7 @@ recover_block:
true, true, true);
up_out:
if (lfs_mode)
- up_write(&fio.sbi->io_order_lock);
+ f2fs_up_write(&fio.sbi->io_order_lock);
put_out:
f2fs_put_dnode(&dn);
out:
@@ -1372,8 +1395,7 @@ retry:
if (err) {
clear_page_private_gcing(page);
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ memalloc_retry_wait(GFP_NOFS);
goto retry;
}
if (is_dirty)
@@ -1454,10 +1476,11 @@ next_step:
if (phase == 3) {
inode = f2fs_iget(sb, dni.ino);
- if (IS_ERR(inode) || is_bad_inode(inode))
+ if (IS_ERR(inode) || is_bad_inode(inode) ||
+ special_file(inode->i_mode))
continue;
- if (!down_write_trylock(
+ if (!f2fs_down_write_trylock(
&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
iput(inode);
sbi->skipped_gc_rwsem++;
@@ -1470,7 +1493,7 @@ next_step:
if (f2fs_post_read_required(inode)) {
int err = ra_data_block(inode, start_bidx);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err) {
iput(inode);
continue;
@@ -1481,7 +1504,7 @@ next_step:
data_page = f2fs_get_read_data_page(inode,
start_bidx, REQ_RAHEAD, true);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -1500,14 +1523,14 @@ next_step:
int err;
if (S_ISREG(inode->i_mode)) {
- if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
+ if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) {
sbi->skipped_gc_rwsem++;
continue;
}
- if (!down_write_trylock(
+ if (!f2fs_down_write_trylock(
&fi->i_gc_rwsem[WRITE])) {
sbi->skipped_gc_rwsem++;
- up_write(&fi->i_gc_rwsem[READ]);
+ f2fs_up_write(&fi->i_gc_rwsem[READ]);
continue;
}
locked = true;
@@ -1530,8 +1553,8 @@ next_step:
submitted++;
if (locked) {
- up_write(&fi->i_gc_rwsem[WRITE]);
- up_write(&fi->i_gc_rwsem[READ]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[READ]);
}
stat_inc_data_blk_count(sbi, 1, gc_type);
@@ -1789,7 +1812,7 @@ stop:
reserved_segments(sbi),
prefree_segments(sbi));
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
put_gc_inode(&gc_list);
@@ -1918,7 +1941,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
long long block_count;
int segs = secs * sbi->segs_per_sec;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
section_count = le32_to_cpu(raw_sb->section_count);
segment_count = le32_to_cpu(raw_sb->segment_count);
@@ -1939,7 +1962,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
cpu_to_le32(dev_segs + segs);
}
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
}
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
@@ -2013,7 +2036,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));
/* stop other GC */
- if (!down_write_trylock(&sbi->gc_lock))
+ if (!f2fs_down_write_trylock(&sbi->gc_lock))
return -EAGAIN;
/* stop CP to protect MAIN_SEC in free_segment_range */
@@ -2033,15 +2056,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
out_unlock:
f2fs_unlock_op(sbi);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
if (err)
return err;
set_sbi_flag(sbi, SBI_IS_RESIZEFS);
freeze_super(sbi->sb);
- down_write(&sbi->gc_lock);
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->cp_global_sem);
spin_lock(&sbi->stat_lock);
if (shrunk_blocks + valid_user_blocks(sbi) +
@@ -2086,8 +2109,8 @@ recover_out:
spin_unlock(&sbi->stat_lock);
}
out_err:
- up_write(&sbi->cp_global_sem);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->gc_lock);
thaw_super(sbi->sb);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
return err;
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index e3beac546c63..3cb1e7a24740 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -105,7 +105,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname)
return;
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (IS_CASEFOLDED(dir)) {
/*
* If the casefolded name is provided, hash it instead of the
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 56a20d5c15da..a578bf83b803 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -131,7 +131,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
if (err)
return err;
- err = f2fs_get_node_info(fio.sbi, dn->nid, &ni);
+ err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false);
if (err) {
f2fs_truncate_data_blocks_range(dn, 1);
f2fs_put_dnode(dn);
@@ -192,7 +192,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb))
return 0;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
@@ -629,7 +629,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
}
if (inode) {
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, ipage);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -658,7 +658,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
f2fs_update_parent_metadata(dir, inode, 0);
fail:
if (inode)
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
out:
f2fs_put_page(ipage, 1);
return err;
@@ -786,7 +786,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
ilen = start + len;
ilen -= start;
- err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
+ err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false);
if (err)
goto out;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 9141147b5bb0..83639238a1fe 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -8,8 +8,8 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
-#include <linux/backing-dev.h>
#include <linux/writeback.h>
+#include <linux/sched/mm.h>
#include "f2fs.h"
#include "node.h"
@@ -516,6 +516,11 @@ make_now:
} else if (ino == F2FS_COMPRESS_INO(sbi)) {
#ifdef CONFIG_F2FS_FS_COMPRESSION
inode->i_mapping->a_ops = &f2fs_compress_aops;
+ /*
+ * generic_error_remove_page only truncates pages of regular
+ * inode
+ */
+ inode->i_mode |= S_IFREG;
#endif
mapping_set_gfp_mask(inode->i_mapping,
GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
@@ -527,7 +532,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- inode_nohighmem(inode);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
} else if (S_ISLNK(inode->i_mode)) {
if (file_is_encrypt(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
@@ -544,6 +549,15 @@ make_now:
goto bad_inode;
}
f2fs_set_inode_flags(inode);
+
+ if (file_should_truncate(inode) &&
+ !is_sbi_flag_set(sbi, SBI_POR_DOING)) {
+ ret = f2fs_truncate(inode);
+ if (ret)
+ goto bad_inode;
+ file_dont_truncate(inode);
+ }
+
unlock_new_inode(inode);
trace_f2fs_iget(inode);
return inode;
@@ -562,7 +576,7 @@ retry:
inode = f2fs_iget(sb, ino);
if (IS_ERR(inode)) {
if (PTR_ERR(inode) == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ memalloc_retry_wait(GFP_NOFS);
goto retry;
}
}
@@ -738,7 +752,8 @@ void f2fs_evict_inode(struct inode *inode)
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
- if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode))
+ if ((inode->i_nlink || is_bad_inode(inode)) &&
+ test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode))
f2fs_invalidate_compress_pages(sbi, inode->i_ino);
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
@@ -754,7 +769,7 @@ void f2fs_evict_inode(struct inode *inode)
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err) {
err = 0;
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
@@ -764,7 +779,8 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
- sb_start_intwrite(inode->i_sb);
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ sb_start_intwrite(inode->i_sb);
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
retry:
@@ -795,7 +811,8 @@ retry:
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
}
- sb_end_intwrite(inode->i_sb);
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ sb_end_intwrite(inode->i_sb);
no_delete:
dquot_drop(inode);
@@ -868,9 +885,10 @@ void f2fs_handle_failed_inode(struct inode *inode)
* so we can prevent losing this orphan when encoutering checkpoint
* and following suddenly power-off.
*/
- err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
+ err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ set_inode_flag(inode, FI_FREE_NID);
f2fs_warn(sbi, "May loss orphan inode, run fsck to fix.");
goto out;
}
diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c
index cdcf54ae0db8..be599f31d3c4 100644
--- a/fs/f2fs/iostat.c
+++ b/fs/f2fs/iostat.c
@@ -92,7 +92,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
- spin_lock_irq(&sbi->iostat_lat_lock);
+ spin_lock_bh(&sbi->iostat_lat_lock);
for (idx = 0; idx < MAX_IO_TYPE; idx++) {
for (io = 0; io < NR_PAGE_TYPE; io++) {
cnt = io_lat->bio_cnt[idx][io];
@@ -106,7 +106,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
io_lat->bio_cnt[idx][io] = 0;
}
}
- spin_unlock_irq(&sbi->iostat_lat_lock);
+ spin_unlock_bh(&sbi->iostat_lat_lock);
trace_f2fs_iostat_latency(sbi, iostat_lat);
}
@@ -120,9 +120,9 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
return;
/* Need double check under the lock */
- spin_lock(&sbi->iostat_lock);
+ spin_lock_bh(&sbi->iostat_lock);
if (time_is_after_jiffies(sbi->iostat_next_period)) {
- spin_unlock(&sbi->iostat_lock);
+ spin_unlock_bh(&sbi->iostat_lock);
return;
}
sbi->iostat_next_period = jiffies +
@@ -133,7 +133,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
sbi->prev_rw_iostat[i];
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
}
- spin_unlock(&sbi->iostat_lock);
+ spin_unlock_bh(&sbi->iostat_lock);
trace_f2fs_iostat(sbi, iostat_diff);
@@ -145,16 +145,16 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int i;
- spin_lock(&sbi->iostat_lock);
+ spin_lock_bh(&sbi->iostat_lock);
for (i = 0; i < NR_IO_TYPE; i++) {
sbi->rw_iostat[i] = 0;
sbi->prev_rw_iostat[i] = 0;
}
- spin_unlock(&sbi->iostat_lock);
+ spin_unlock_bh(&sbi->iostat_lock);
- spin_lock_irq(&sbi->iostat_lat_lock);
+ spin_lock_bh(&sbi->iostat_lat_lock);
memset(io_lat, 0, sizeof(struct iostat_lat_info));
- spin_unlock_irq(&sbi->iostat_lat_lock);
+ spin_unlock_bh(&sbi->iostat_lat_lock);
}
void f2fs_update_iostat(struct f2fs_sb_info *sbi,
@@ -163,19 +163,16 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi,
if (!sbi->iostat_enable)
return;
- spin_lock(&sbi->iostat_lock);
+ spin_lock_bh(&sbi->iostat_lock);
sbi->rw_iostat[type] += io_bytes;
- if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
- sbi->rw_iostat[APP_BUFFERED_IO] =
- sbi->rw_iostat[APP_WRITE_IO] -
- sbi->rw_iostat[APP_DIRECT_IO];
+ if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO)
+ sbi->rw_iostat[APP_WRITE_IO] += io_bytes;
- if (type == APP_READ_IO || type == APP_DIRECT_READ_IO)
- sbi->rw_iostat[APP_BUFFERED_READ_IO] =
- sbi->rw_iostat[APP_READ_IO] -
- sbi->rw_iostat[APP_DIRECT_READ_IO];
- spin_unlock(&sbi->iostat_lock);
+ if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO)
+ sbi->rw_iostat[APP_READ_IO] += io_bytes;
+
+ spin_unlock_bh(&sbi->iostat_lock);
f2fs_record_iostat(sbi);
}
@@ -185,7 +182,6 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
{
unsigned long ts_diff;
unsigned int iotype = iostat_ctx->type;
- unsigned long flags;
struct f2fs_sb_info *sbi = iostat_ctx->sbi;
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int idx;
@@ -206,12 +202,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
idx = WRITE_ASYNC_IO;
}
- spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
+ spin_lock_bh(&sbi->iostat_lat_lock);
io_lat->sum_lat[idx][iotype] += ts_diff;
io_lat->bio_cnt[idx][iotype]++;
if (ts_diff > io_lat->peak_lat[idx][iotype])
io_lat->peak_lat[idx][iotype] = ts_diff;
- spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
+ spin_unlock_bh(&sbi->iostat_lat_lock);
}
void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 9c528e583c9d..5ed79b29999f 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -22,7 +22,8 @@
#include "acl.h"
#include <trace/events/f2fs.h>
-static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
+ struct inode *dir, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
nid_t ino;
@@ -46,7 +47,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_free = true;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_ino = ino;
inode->i_blocks = 0;
@@ -67,14 +68,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
(F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
else
- F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
+ F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
F2FS_DEF_PROJID);
err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
if (err)
goto fail_drop;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
goto fail_drop;
@@ -196,7 +197,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
int i, cold_count, hot_count;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
@@ -206,7 +207,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
break;
}
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
if (i == cold_count + hot_count)
return;
@@ -299,19 +300,19 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
(!ext_cnt && !noext_cnt))
return;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
for (i = cold_count; i < cold_count + hot_count; i++) {
if (is_extension_exist(name, extlist[i], false)) {
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
return;
}
}
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
for (i = 0; i < noext_cnt; i++) {
if (is_extension_exist(name, noext[i], false)) {
@@ -345,11 +346,11 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -404,7 +405,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
F2FS_I(old_dentry->d_inode)->i_projid)))
return -EXDEV;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -460,7 +461,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
return 0;
}
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -561,7 +562,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
goto out_iput;
}
out_splice:
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (!inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
@@ -598,10 +599,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
goto fail;
}
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
goto fail;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
goto fail;
@@ -622,7 +623,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
goto fail;
}
f2fs_delete_entry(de, page, dir, inode);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
@@ -675,11 +676,11 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
- inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -746,18 +747,18 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
- inode = f2fs_new_inode(dir, S_IFDIR | mode);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- inode_nohighmem(inode);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
@@ -803,11 +804,11 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -834,18 +835,19 @@ out:
return err;
}
-static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct inode **whiteout)
+static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode,
+ struct inode **whiteout)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -909,20 +911,22 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- return __f2fs_tmpfile(dir, dentry, mode, NULL);
+ return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL);
}
-static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
+static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
+ struct inode *dir, struct inode **whiteout)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
return -EIO;
- return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
+ return __f2fs_tmpfile(mnt_userns, dir, NULL,
+ S_IFCHR | WHITEOUT_MODE, whiteout);
}
-static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = d_inode(old_dentry);
@@ -960,21 +964,21 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (flags & RENAME_WHITEOUT) {
- err = f2fs_create_whiteout(old_dir, &whiteout);
+ err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout);
if (err)
return err;
}
- err = dquot_initialize(old_dir);
+ err = f2fs_dquot_initialize(old_dir);
if (err)
goto out;
- err = dquot_initialize(new_dir);
+ err = f2fs_dquot_initialize(new_dir);
if (err)
goto out;
if (new_inode) {
- err = dquot_initialize(new_inode);
+ err = f2fs_dquot_initialize(new_inode);
if (err)
goto out;
}
@@ -1023,11 +1027,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_page = NULL;
new_inode->i_ctime = current_time(new_inode);
- down_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
f2fs_i_links_write(new_inode, false);
f2fs_i_links_write(new_inode, false);
- up_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(new_inode)->i_sem);
if (!new_inode->i_nlink)
f2fs_add_orphan_inode(new_inode);
@@ -1048,13 +1052,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(new_dir, true);
}
- down_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry || whiteout)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
- up_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
f2fs_mark_inode_dirty_sync(old_inode, false);
@@ -1107,8 +1111,7 @@ out_dir:
out_old:
f2fs_put_page(old_page, 0);
out:
- if (whiteout)
- iput(whiteout);
+ iput(whiteout);
return err;
}
@@ -1138,11 +1141,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
F2FS_I(new_dentry->d_inode)->i_projid)))
return -EXDEV;
- err = dquot_initialize(old_dir);
+ err = f2fs_dquot_initialize(old_dir);
if (err)
goto out;
- err = dquot_initialize(new_dir);
+ err = f2fs_dquot_initialize(new_dir);
if (err)
goto out;
@@ -1214,38 +1217,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
/* update directory entry info of old dir inode */
f2fs_set_link(old_dir, old_entry, old_page, new_inode);
- down_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
- up_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_dir->i_ctime = current_time(old_dir);
if (old_nlink) {
- down_write(&F2FS_I(old_dir)->i_sem);
+ f2fs_down_write(&F2FS_I(old_dir)->i_sem);
f2fs_i_links_write(old_dir, old_nlink > 0);
- up_write(&F2FS_I(old_dir)->i_sem);
+ f2fs_up_write(&F2FS_I(old_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
- down_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (!new_dir_entry)
file_lost_pino(new_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(new_inode, old_dir->i_ino);
- up_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(new_inode)->i_sem);
new_dir->i_ctime = current_time(new_dir);
if (new_nlink) {
- down_write(&F2FS_I(new_dir)->i_sem);
+ f2fs_down_write(&F2FS_I(new_dir)->i_sem);
f2fs_i_links_write(new_dir, new_nlink > 0);
- up_write(&F2FS_I(new_dir)->i_sem);
+ f2fs_up_write(&F2FS_I(new_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(new_dir, false);
@@ -1300,7 +1303,8 @@ static int f2fs_rename2(struct user_namespace *mnt_userns,
* VFS has already handled the new dentry existence case,
* here, we just deal with "RENAME_NOREPLACE" as regular rename.
*/
- return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return f2fs_rename(mnt_userns, old_dir, old_dentry,
+ new_dir, new_dentry, flags);
}
static const char *f2fs_encrypted_get_link(struct dentry *dentry,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index e863136081b4..c45d341dcf6e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -8,7 +8,7 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/mpage.h>
-#include <linux/backing-dev.h>
+#include <linux/sched/mm.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/swap.h>
@@ -382,14 +382,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool need = false;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
need = true;
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return need;
}
@@ -399,11 +399,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return is_cp;
}
@@ -413,13 +413,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return need_update;
}
@@ -430,11 +430,15 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *new, *e;
+ /* Let's mitigate lock contention of nat_tree_lock during checkpoint */
+ if (f2fs_rwsem_is_locked(&sbi->cp_global_sem))
+ return;
+
new = __alloc_nat_entry(sbi, nid, false);
if (!new)
return;
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e)
e = __init_nat_entry(nm_i, new, ne, false);
@@ -443,7 +447,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
nat_get_blkaddr(e) !=
le32_to_cpu(ne->block_addr) ||
nat_get_version(e) != ne->version);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
if (e != new)
__free_nat_entry(new);
}
@@ -455,7 +459,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct nat_entry *e;
struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = __init_nat_entry(nm_i, new, NULL, true);
@@ -504,7 +508,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
}
int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -512,7 +516,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
struct f2fs_nm_info *nm_i = NM_I(sbi);
int nr = nr_shrink;
- if (!down_write_trylock(&nm_i->nat_tree_lock))
+ if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock))
return 0;
spin_lock(&nm_i->nat_list_lock);
@@ -534,12 +538,12 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
}
spin_unlock(&nm_i->nat_list_lock);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
return nr - nr_shrink;
}
int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
- struct node_info *ni)
+ struct node_info *ni, bool checkpoint_context)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -556,13 +560,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
ni->nid = nid;
retry:
/* Check nat cache */
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return 0;
}
@@ -572,10 +576,11 @@ retry:
* nat_tree_lock. Therefore, we should retry, if we failed to grab here
* while not bothering checkpoint.
*/
- if (!rwsem_is_locked(&sbi->cp_global_sem)) {
+ if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
down_read(&curseg->journal_rwsem);
- } else if (!down_read_trylock(&curseg->journal_rwsem)) {
- up_read(&nm_i->nat_tree_lock);
+ } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) ||
+ !down_read_trylock(&curseg->journal_rwsem)) {
+ f2fs_up_read(&nm_i->nat_tree_lock);
goto retry;
}
@@ -584,15 +589,15 @@ retry:
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- up_read(&curseg->journal_rwsem);
+ up_read(&curseg->journal_rwsem);
if (i >= 0) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
goto cache;
}
/* Fill node_info from nat page */
index = current_nat_addr(sbi, nid);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
page = f2fs_get_meta_page(sbi, index);
if (IS_ERR(page))
@@ -887,7 +892,7 @@ static int truncate_node(struct dnode_of_data *dn)
int err;
pgoff_t index;
- err = f2fs_get_node_info(sbi, dn->nid, &ni);
+ err = f2fs_get_node_info(sbi, dn->nid, &ni, false);
if (err)
return err;
@@ -1286,7 +1291,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
goto fail;
#ifdef CONFIG_F2FS_CHECK_FS
- err = f2fs_get_node_info(sbi, dn->nid, &new_ni);
+ err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false);
if (err) {
dec_valid_node_count(sbi, dn->inode, !ofs);
goto fail;
@@ -1348,7 +1353,7 @@ static int read_node_page(struct page *page, int op_flags)
return LOCKED_PAGE;
}
- err = f2fs_get_node_info(sbi, page->index, &ni);
+ err = f2fs_get_node_info(sbi, page->index, &ni, false);
if (err)
return err;
@@ -1443,6 +1448,7 @@ page_hit:
nid, nid_of_node(page), ino_of_node(page),
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
err = -EINVAL;
out_err:
ClearPageUptodate(page);
@@ -1599,21 +1605,21 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
nid = nid_of_node(page);
f2fs_bug_on(sbi, page->index != nid);
- if (f2fs_get_node_info(sbi, nid, &ni))
+ if (f2fs_get_node_info(sbi, nid, &ni, !do_balance))
goto redirty_out;
if (wbc->for_reclaim) {
- if (!down_read_trylock(&sbi->node_write))
+ if (!f2fs_down_read_trylock(&sbi->node_write))
goto redirty_out;
} else {
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
}
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
unlock_page(page);
return 0;
}
@@ -1621,7 +1627,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (__is_valid_data_blkaddr(ni.blk_addr) &&
!f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
DATA_GENERIC_ENHANCE)) {
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
goto redirty_out;
}
@@ -1642,7 +1648,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
if (wbc->for_reclaim) {
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
@@ -1776,6 +1782,7 @@ continue_unlock:
if (!atomic || page == last_page) {
set_fsync_mark(page, 1);
+ percpu_counter_inc(&sbi->rf_node_block_count);
if (IS_INODE(page)) {
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
@@ -2105,8 +2112,12 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[NODE]);
- else if (atomic_read(&sbi->wb_sync_req[NODE]))
+ else if (atomic_read(&sbi->wb_sync_req[NODE])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -2126,23 +2137,24 @@ skip_write:
return 0;
}
-static int f2fs_set_node_page_dirty(struct page *page)
+static bool f2fs_dirty_node_folio(struct address_space *mapping,
+ struct folio *folio)
{
- trace_f2fs_set_page_dirty(page, NODE);
+ trace_f2fs_set_page_dirty(&folio->page, NODE);
- if (!PageUptodate(page))
- SetPageUptodate(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
#ifdef CONFIG_F2FS_CHECK_FS
- if (IS_INODE(page))
- f2fs_inode_chksum_set(F2FS_P_SB(page), page);
+ if (IS_INODE(&folio->page))
+ f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
#endif
- if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
- inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
- set_page_private_reference(page);
- return 1;
+ if (!folio_test_dirty(folio)) {
+ filemap_dirty_folio(mapping, folio);
+ inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
+ set_page_private_reference(&folio->page);
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -2151,8 +2163,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
const struct address_space_operations f2fs_node_aops = {
.writepage = f2fs_write_node_page,
.writepages = f2fs_write_node_pages,
- .set_page_dirty = f2fs_set_node_page_dirty,
- .invalidatepage = f2fs_invalidate_page,
+ .dirty_folio = f2fs_dirty_node_folio,
+ .invalidate_folio = f2fs_invalidate_folio,
.releasepage = f2fs_release_page,
#ifdef CONFIG_MIGRATION
.migratepage = f2fs_migrate_page,
@@ -2219,14 +2231,14 @@ bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
unsigned int i;
bool ret = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
ret = false;
break;
}
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return ret;
}
@@ -2409,7 +2421,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
unsigned int i, idx;
nid_t nid;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap))
@@ -2432,7 +2444,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
out:
scan_curseg_cache(sbi);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
@@ -2467,7 +2479,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
while (1) {
if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
@@ -2482,7 +2494,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
}
if (ret) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
return ret;
}
@@ -2502,7 +2514,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
/* find free nids from current sum_pages */
scan_curseg_cache(sbi);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
@@ -2700,7 +2712,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
goto recover_xnid;
/* 1: invalidate the previous xattr nid */
- err = f2fs_get_node_info(sbi, prev_xnid, &ni);
+ err = f2fs_get_node_info(sbi, prev_xnid, &ni, false);
if (err)
return err;
@@ -2740,7 +2752,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
struct page *ipage;
int err;
- err = f2fs_get_node_info(sbi, ino, &old_ni);
+ err = f2fs_get_node_info(sbi, ino, &old_ni, false);
if (err)
return err;
@@ -2749,7 +2761,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
retry:
ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
if (!ipage) {
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ memalloc_retry_wait(GFP_NOFS);
goto retry;
}
@@ -2947,7 +2959,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_ofs;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
unsigned int valid = 0, nid_ofs = 0;
@@ -2967,7 +2979,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
__update_nat_bits(nm_i, nat_ofs, valid);
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
@@ -3065,15 +3077,15 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* nat_cnt[DIRTY_NAT].
*/
if (cpc->reason & CP_UMOUNT) {
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
}
if (!nm_i->nat_cnt[DIRTY_NAT])
return 0;
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
/*
* if there are no enough space in journal to store dirty nat
@@ -3102,7 +3114,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
break;
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
/* Allow dirty nats by node block allocation in write_begin */
return err;
@@ -3212,6 +3224,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
+ nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
@@ -3222,7 +3235,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->nid_list_lock);
- init_rwsem(&nm_i->nat_tree_lock);
+ init_f2fs_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -3328,7 +3341,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->nid_list_lock);
/* destroy nat cache */
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@@ -3358,7 +3371,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
kmem_cache_free(nat_entry_set_slab, setvec[idx]);
}
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
kvfree(nm_i->nat_block_bitmap);
if (nm_i->free_nid_bitmap) {
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index ff14a6e5ac1c..4c1d34bfea78 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -31,6 +31,9 @@
/* control total # of nats */
#define DEF_NAT_CACHE_THRESHOLD 100000
+/* control total # of node writes used for roll-forward recovery */
+#define DEF_RF_NODE_BLOCKS 0
+
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
#define SETVEC_SIZE 32
@@ -138,11 +141,6 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD;
}
-static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi)
-{
- return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8;
-}
-
enum mem_type {
FREE_NIDS, /* indicates the free nid list */
NAT_ENTRIES, /* indicates the cached nat entry */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 04655511d7f5..3cb7f8a43b4d 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -8,6 +8,7 @@
#include <asm/unaligned.h>
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
+#include <linux/sched/mm.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
@@ -45,7 +46,7 @@
static struct kmem_cache *fsync_entry_slab;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
extern struct kmem_cache *f2fs_cf_name_slab;
#endif
@@ -55,6 +56,10 @@ bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
return false;
+ if (NM_I(sbi)->max_rf_node_blocks &&
+ percpu_counter_sum_positive(&sbi->rf_node_block_count) >=
+ NM_I(sbi)->max_rf_node_blocks)
+ return false;
return true;
}
@@ -81,7 +86,7 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
if (IS_ERR(inode))
return ERR_CAST(inode);
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
goto err_out;
@@ -148,7 +153,7 @@ static int init_recovered_filename(const struct inode *dir,
if (err)
return err;
f2fs_hash_filename(dir, fname);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/* Case-sensitive match is fine for recovery */
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
fname->cf_name.name = NULL;
@@ -203,7 +208,7 @@ retry:
goto out_put;
}
- err = dquot_initialize(einode);
+ err = f2fs_dquot_initialize(einode);
if (err) {
iput(einode);
goto out_put;
@@ -342,6 +347,19 @@ static int recover_inode(struct inode *inode, struct page *page)
return 0;
}
+static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi,
+ unsigned int ra_blocks, unsigned int blkaddr,
+ unsigned int next_blkaddr)
+{
+ if (blkaddr + 1 == next_blkaddr)
+ ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
+ ra_blocks * 2);
+ else if (next_blkaddr % sbi->blocks_per_seg)
+ ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS,
+ ra_blocks / 2);
+ return ra_blocks;
+}
+
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
bool check_only)
{
@@ -349,6 +367,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
struct page *page = NULL;
block_t blkaddr;
unsigned int loop_cnt = 0;
+ unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg -
valid_user_blocks(sbi);
int err = 0;
@@ -423,11 +442,14 @@ next:
break;
}
+ ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
+ next_blkaddr_of_node(page));
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
- f2fs_ra_meta_pages_cond(sbi, blkaddr);
+ f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
return err;
}
@@ -508,7 +530,7 @@ got_it:
if (IS_ERR(inode))
return PTR_ERR(inode);
- ret = dquot_initialize(inode);
+ ret = f2fs_dquot_initialize(inode);
if (ret) {
iput(inode);
return ret;
@@ -587,7 +609,7 @@ retry_dn:
err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ memalloc_retry_wait(GFP_NOFS);
goto retry_dn;
}
goto out;
@@ -595,7 +617,7 @@ retry_dn:
f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
- err = f2fs_get_node_info(sbi, dn.nid, &ni);
+ err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
if (err)
goto err;
@@ -670,8 +692,7 @@ retry_prev:
err = check_index_in_prev_nodes(sbi, dest, &dn);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ memalloc_retry_wait(GFP_NOFS);
goto retry_prev;
}
goto err;
@@ -704,6 +725,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
struct page *page = NULL;
int err = 0;
block_t blkaddr;
+ unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -715,8 +737,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
- f2fs_ra_meta_pages_cond(sbi, blkaddr);
-
page = f2fs_get_tmp_page(sbi, blkaddr);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -759,9 +779,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (entry->blkaddr == blkaddr)
list_move_tail(&entry->list, tmp_inode_list);
next:
+ ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
+ next_blkaddr_of_node(page));
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
+
+ f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
if (!err)
f2fs_allocate_new_segments(sbi);
@@ -787,8 +812,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
}
#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- sbi->sb->s_flags |= SB_ACTIVE;
/* Turn on quotas so that they are updated correctly */
quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY);
#endif
@@ -798,7 +821,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->cp_global_sem);
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list, check_only);
@@ -816,10 +839,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
- else {
- /* restore s_flags to let iput() trash data */
- sbi->sb->s_flags = s_flags;
- }
+ else
+ f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE);
skip:
fix_curseg_write_pointer = !check_only || list_empty(&inode_list);
@@ -849,7 +870,7 @@ skip:
if (!err)
clear_sbi_flag(sbi, SBI_POR_DOING);
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
/* let's drop all the directory inodes for clean checkpoint */
destroy_fsync_dnodes(&dir_list, err);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index a135d2247415..bd9731cdec56 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -9,12 +9,14 @@
#include <linux/f2fs_fs.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/sched/mm.h>
#include <linux/prefetch.h>
#include <linux/kthread.h>
#include <linux/swap.h>
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
+#include <linux/random.h>
#include "f2fs.h"
#include "segment.h"
@@ -244,16 +246,14 @@ retry:
LOOKUP_NODE);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
- cond_resched();
+ memalloc_retry_wait(GFP_NOFS);
goto retry;
}
err = -EAGAIN;
goto next;
}
- err = f2fs_get_node_info(sbi, dn.nid, &ni);
+ err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
if (err) {
f2fs_put_dnode(&dn);
return err;
@@ -313,8 +313,7 @@ next:
skip:
iput(inode);
}
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
- cond_resched();
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
if (gc_failure) {
if (++looped >= count)
return;
@@ -423,9 +422,7 @@ retry:
err = f2fs_do_write_data_page(&fio);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
- cond_resched();
+ memalloc_retry_wait(GFP_NOFS);
goto retry;
}
unlock_page(page);
@@ -473,7 +470,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
f2fs_balance_fs(sbi, true);
- down_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
f2fs_lock_op(sbi);
set_inode_flag(inode, FI_ATOMIC_COMMIT);
@@ -485,7 +482,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
clear_inode_flag(inode, FI_ATOMIC_COMMIT);
f2fs_unlock_op(sbi);
- up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
return err;
}
@@ -523,12 +520,31 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
io_schedule();
finish_wait(&sbi->gc_thread->fggc_wq, &wait);
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
f2fs_gc(sbi, false, false, false, NULL_SEGNO);
}
}
}
+static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi)
+{
+ int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2;
+ unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS);
+ unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA);
+ unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES);
+ unsigned int meta = get_pages(sbi, F2FS_DIRTY_META);
+ unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
+ unsigned int threshold = sbi->blocks_per_seg * factor *
+ DEFAULT_DIRTY_THRESHOLD;
+ unsigned int global_threshold = threshold * 3 / 2;
+
+ if (dents >= threshold || qdata >= threshold ||
+ nodes >= threshold || meta >= threshold ||
+ imeta >= threshold)
+ return true;
+ return dents + qdata + nodes + meta + imeta > global_threshold;
+}
+
void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
{
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
@@ -547,13 +563,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
else
f2fs_build_free_nids(sbi, false, false);
- if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) ||
- excess_prefree_segs(sbi))
+ if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) ||
+ excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi))
goto do_sync;
/* there is background inflight IO or foreground operation recently */
if (is_inflight_io(sbi, REQ_TIME) ||
- (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem)))
+ (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem)))
return;
/* exceed periodical checkpoint timeout threshold */
@@ -561,7 +577,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
goto do_sync;
/* checkpoint is the only way to shrink partial cached entries */
- if (f2fs_available_free_memory(sbi, NAT_ENTRIES) ||
+ if (f2fs_available_free_memory(sbi, NAT_ENTRIES) &&
f2fs_available_free_memory(sbi, INO_ENTRIES))
return;
@@ -786,8 +802,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
do {
ret = __submit_flush_wait(sbi, FDEV(i).bdev);
if (ret)
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
} while (ret && --count);
if (ret) {
@@ -1139,14 +1154,14 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->ordered = false;
dpolicy->granularity = granularity;
- dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+ dpolicy->max_requests = dcc->max_discard_request;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
dpolicy->timeout = false;
if (discard_type == DPOLICY_BG) {
- dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
- dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
- dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->min_interval = dcc->min_discard_issue_time;
+ dpolicy->mid_interval = dcc->mid_discard_issue_time;
+ dpolicy->max_interval = dcc->max_discard_issue_time;
dpolicy->io_aware = true;
dpolicy->sync = false;
dpolicy->ordered = true;
@@ -1154,12 +1169,12 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->granularity = 1;
if (atomic_read(&dcc->discard_cmd_cnt))
dpolicy->max_interval =
- DEF_MIN_DISCARD_ISSUE_TIME;
+ dcc->min_discard_issue_time;
}
} else if (discard_type == DPOLICY_FORCE) {
- dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
- dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
- dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->min_interval = dcc->min_discard_issue_time;
+ dpolicy->mid_interval = dcc->mid_discard_issue_time;
+ dpolicy->max_interval = dcc->max_discard_issue_time;
dpolicy->io_aware = false;
} else if (discard_type == DPOLICY_FSTRIM) {
dpolicy->io_aware = false;
@@ -1764,7 +1779,7 @@ static int issue_discard_thread(void *data)
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
wait_queue_head_t *q = &dcc->discard_wait_queue;
struct discard_policy dpolicy;
- unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
+ unsigned int wait_ms = dcc->min_discard_issue_time;
int issued;
set_freezable();
@@ -2163,6 +2178,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
atomic_set(&dcc->discard_cmd_cnt, 0);
dcc->nr_discards = 0;
dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST;
+ dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
+ dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
+ dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME;
dcc->undiscard_blks = 0;
dcc->next_pos = 0;
dcc->root = RB_ROOT_CACHED;
@@ -2538,8 +2557,8 @@ find_other_zone:
secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
if (secno >= MAIN_SECS(sbi)) {
if (dir == ALLOC_RIGHT) {
- secno = find_next_zero_bit(free_i->free_secmap,
- MAIN_SECS(sbi), 0);
+ secno = find_first_zero_bit(free_i->free_secmap,
+ MAIN_SECS(sbi));
f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
} else {
go_left = 1;
@@ -2554,8 +2573,8 @@ find_other_zone:
left_start--;
continue;
}
- left_start = find_next_zero_bit(free_i->free_secmap,
- MAIN_SECS(sbi), 0);
+ left_start = find_first_zero_bit(free_i->free_secmap,
+ MAIN_SECS(sbi));
f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
break;
}
@@ -2630,6 +2649,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
unsigned short seg_type = curseg->seg_type;
sanity_check_seg_type(sbi, seg_type);
+ if (f2fs_need_rand_seg(sbi))
+ return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec);
/* if segs_per_sec is large than 1, we need to keep original policy. */
if (__is_large_section(sbi))
@@ -2681,6 +2702,9 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
curseg->next_segno = segno;
reset_curseg(sbi, type, 1);
curseg->alloc_type = LFS;
+ if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
+ curseg->fragment_remained_chunk =
+ prandom_u32() % sbi->max_fragment_chunk + 1;
}
static int __next_free_blkoff(struct f2fs_sb_info *sbi,
@@ -2707,12 +2731,22 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi,
static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
struct curseg_info *seg)
{
- if (seg->alloc_type == SSR)
+ if (seg->alloc_type == SSR) {
seg->next_blkoff =
__next_free_blkoff(sbi, seg->segno,
seg->next_blkoff + 1);
- else
+ } else {
seg->next_blkoff++;
+ if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) {
+ /* To allocate block chunks in different sizes, use random number */
+ if (--seg->fragment_remained_chunk <= 0) {
+ seg->fragment_remained_chunk =
+ prandom_u32() % sbi->max_fragment_chunk + 1;
+ seg->next_blkoff +=
+ prandom_u32() % sbi->max_fragment_hole + 1;
+ }
+ }
+ }
}
bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
@@ -2789,7 +2823,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
if (!sbi->am.atgc_enabled)
return;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
@@ -2799,7 +2833,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
up_write(&SIT_I(sbi)->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
@@ -2950,7 +2984,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int segno;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
@@ -2974,7 +3008,7 @@ unlock:
type, segno, curseg->segno);
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
@@ -3006,23 +3040,23 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi,
void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
{
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
__allocate_new_section(sbi, type, force);
up_write(&SIT_I(sbi)->sentry_lock);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
{
int i;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
__allocate_new_segment(sbi, i, false, false);
up_write(&SIT_I(sbi)->sentry_lock);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
static const struct segment_allocation default_salloc_ops = {
@@ -3101,7 +3135,7 @@ next:
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
trimmed += __wait_all_discard_cmd(sbi, NULL);
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto next;
}
skip:
@@ -3160,9 +3194,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
if (sbi->discard_blks == 0)
goto out;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
if (err)
goto out;
@@ -3209,101 +3243,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
}
}
-/* This returns write hints for each segment type. This hints will be
- * passed down to block layer. There are mapping tables which depend on
- * the mount option 'whint_mode'.
- *
- * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
- *
- * 2) whint_mode=user-based. F2FS tries to pass down hints given by users.
- *
- * User F2FS Block
- * ---- ---- -----
- * META WRITE_LIFE_NOT_SET
- * HOT_NODE "
- * WARM_NODE "
- * COLD_NODE "
- * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
- * extension list " "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " "
- * WRITE_LIFE_MEDIUM " "
- * WRITE_LIFE_LONG " "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG " WRITE_LIFE_LONG
- *
- * 3) whint_mode=fs-based. F2FS passes down hints with its policy.
- *
- * User F2FS Block
- * ---- ---- -----
- * META WRITE_LIFE_MEDIUM;
- * HOT_NODE WRITE_LIFE_NOT_SET
- * WARM_NODE "
- * COLD_NODE WRITE_LIFE_NONE
- * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
- * extension list " "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG
- * WRITE_LIFE_NONE " "
- * WRITE_LIFE_MEDIUM " "
- * WRITE_LIFE_LONG " "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG " WRITE_LIFE_LONG
- */
-
-enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
- enum page_type type, enum temp_type temp)
-{
- if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) {
- if (type == DATA) {
- if (temp == WARM)
- return WRITE_LIFE_NOT_SET;
- else if (temp == HOT)
- return WRITE_LIFE_SHORT;
- else if (temp == COLD)
- return WRITE_LIFE_EXTREME;
- } else {
- return WRITE_LIFE_NOT_SET;
- }
- } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) {
- if (type == DATA) {
- if (temp == WARM)
- return WRITE_LIFE_LONG;
- else if (temp == HOT)
- return WRITE_LIFE_SHORT;
- else if (temp == COLD)
- return WRITE_LIFE_EXTREME;
- } else if (type == NODE) {
- if (temp == WARM || temp == HOT)
- return WRITE_LIFE_NOT_SET;
- else if (temp == COLD)
- return WRITE_LIFE_NONE;
- } else if (type == META) {
- return WRITE_LIFE_MEDIUM;
- }
- }
- return WRITE_LIFE_NOT_SET;
-}
-
static int __get_segment_type_2(struct f2fs_io_info *fio)
{
if (fio->type == DATA)
@@ -3399,7 +3338,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
struct seg_entry *se = NULL;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&sit_i->sentry_lock);
@@ -3482,27 +3421,33 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
-static void update_device_state(struct f2fs_io_info *fio)
+void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
+ block_t blkaddr, unsigned int blkcnt)
{
- struct f2fs_sb_info *sbi = fio->sbi;
- unsigned int devidx;
-
if (!f2fs_is_multi_device(sbi))
return;
- devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
+ while (1) {
+ unsigned int devidx = f2fs_target_device_index(sbi, blkaddr);
+ unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1;
+
+ /* update device state for fsync */
+ f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO);
- /* update device state for fsync */
- f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO);
+ /* update device state for checkpoint */
+ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
+ spin_lock(&sbi->dev_lock);
+ f2fs_set_bit(devidx, (char *)&sbi->dirty_device);
+ spin_unlock(&sbi->dev_lock);
+ }
- /* update device state for checkpoint */
- if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
- spin_lock(&sbi->dev_lock);
- f2fs_set_bit(devidx, (char *)&sbi->dirty_device);
- spin_unlock(&sbi->dev_lock);
+ if (blkcnt <= blks)
+ break;
+ blkcnt -= blks;
+ blkaddr += blks;
}
}
@@ -3512,7 +3457,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
if (keep_order)
- down_read(&fio->sbi->io_order_lock);
+ f2fs_down_read(&fio->sbi->io_order_lock);
reallocate:
f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio);
@@ -3529,10 +3474,10 @@ reallocate:
goto reallocate;
}
- update_device_state(fio);
+ f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
if (keep_order)
- up_read(&fio->sbi->io_order_lock);
+ f2fs_up_read(&fio->sbi->io_order_lock);
}
void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -3611,6 +3556,9 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
goto drop_bio;
}
+ invalidate_mapping_pages(META_MAPPING(sbi),
+ fio->new_blkaddr, fio->new_blkaddr);
+
stat_inc_inplace_blocks(fio->sbi);
if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE)))
@@ -3618,7 +3566,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
else
err = f2fs_submit_page_bio(fio);
if (!err) {
- update_device_state(fio);
+ f2fs_update_device_state(fio->sbi, fio->ino,
+ fio->new_blkaddr, 1);
f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
}
@@ -3663,7 +3612,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
se = get_seg_entry(sbi, segno);
type = se->type;
- down_write(&SM_I(sbi)->curseg_lock);
+ f2fs_down_write(&SM_I(sbi)->curseg_lock);
if (!recover_curseg) {
/* for recovery flow */
@@ -3732,7 +3681,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
up_write(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- up_write(&SM_I(sbi)->curseg_lock);
+ f2fs_up_write(&SM_I(sbi)->curseg_lock);
}
void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
@@ -4747,6 +4696,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
sanity_check_seg_type(sbi, curseg->seg_type);
+ if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) {
+ f2fs_err(sbi,
+ "Current segment has invalid alloc_type:%d",
+ curseg->alloc_type);
+ return -EFSCORRUPTED;
+ }
+
if (f2fs_test_bit(blkofs, se->cur_valid_map))
goto out;
@@ -5216,7 +5172,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sm_info->sit_entry_set);
- init_rwsem(&sm_info->curseg_lock);
+ init_f2fs_rwsem(&sm_info->curseg_lock);
if (!f2fs_readonly(sbi->sb)) {
err = f2fs_create_flush_cmd_control(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 89fff258727d..5c94caf0c0a1 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -314,6 +314,7 @@ struct curseg_info {
unsigned short next_blkoff; /* next block offset to write */
unsigned int zone; /* current zone number */
unsigned int next_segno; /* preallocated segment */
+ int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */
bool inited; /* indicate inmem log is inited */
};
@@ -537,7 +538,8 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi)
{
- return SM_I(sbi)->reserved_segments;
+ return SM_I(sbi)->reserved_segments +
+ SM_I(sbi)->additional_reserved_segments;
}
static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
@@ -649,7 +651,9 @@ static inline int utilization(struct f2fs_sb_info *sbi)
* pages over min_fsync_blocks. (=default option)
* F2FS_IPU_ASYNC - do IPU given by asynchronous write requests.
* F2FS_IPU_NOCACHE - disable IPU bio cache.
- * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode)
+ * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has
+ * FI_OPU_WRITE flag.
+ * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 8
@@ -665,6 +669,7 @@ enum {
F2FS_IPU_FSYNC,
F2FS_IPU_ASYNC,
F2FS_IPU_NOCACHE,
+ F2FS_IPU_HONOR_OPU_WRITE,
};
static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 78ebc306ee2b..4368f90571bd 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -8,9 +8,9 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/sched/mm.h>
#include <linux/statfs.h>
#include <linux/buffer_head.h>
-#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/parser.h>
#include <linux/mount.h>
@@ -58,6 +58,8 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_DISCARD] = "discard error",
[FAULT_WRITE_IO] = "write IO error",
[FAULT_SLAB_ALLOC] = "slab alloc",
+ [FAULT_DQUOT_INIT] = "dquot initialize",
+ [FAULT_LOCK_OP] = "lock_op",
};
void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -136,7 +138,6 @@ enum {
Opt_jqfmt_vfsold,
Opt_jqfmt_vfsv0,
Opt_jqfmt_vfsv1,
- Opt_whint,
Opt_alloc,
Opt_fsync,
Opt_test_dummy_encryption,
@@ -212,7 +213,6 @@ static match_table_t f2fs_tokens = {
{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_whint, "whint_mode=%s"},
{Opt_alloc, "alloc_mode=%s"},
{Opt_fsync, "fsync_mode=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
@@ -255,33 +255,26 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
va_end(args);
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
static const struct f2fs_sb_encodings {
__u16 magic;
char *name;
- char *version;
+ unsigned int version;
} f2fs_sb_encoding_map[] = {
- {F2FS_ENC_UTF8_12_1, "utf8", "12.1.0"},
+ {F2FS_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
};
-static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb,
- const struct f2fs_sb_encodings **encoding,
- __u16 *flags)
+static const struct f2fs_sb_encodings *
+f2fs_sb_read_encoding(const struct f2fs_super_block *sb)
{
__u16 magic = le16_to_cpu(sb->s_encoding);
int i;
for (i = 0; i < ARRAY_SIZE(f2fs_sb_encoding_map); i++)
if (magic == f2fs_sb_encoding_map[i].magic)
- break;
-
- if (i >= ARRAY_SIZE(f2fs_sb_encoding_map))
- return -EINVAL;
+ return &f2fs_sb_encoding_map[i];
- *encoding = &f2fs_sb_encoding_map[i];
- *flags = le16_to_cpu(sb->s_encoding_flags);
-
- return 0;
+ return NULL;
}
struct kmem_cache *f2fs_cf_name_slab;
@@ -327,6 +320,46 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).s_resgid));
}
+static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi)
+{
+ unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec;
+ unsigned int avg_vblocks;
+ unsigned int wanted_reserved_segments;
+ block_t avail_user_block_count;
+
+ if (!F2FS_IO_ALIGNED(sbi))
+ return 0;
+
+ /* average valid block count in section in worst case */
+ avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi);
+
+ /*
+ * we need enough free space when migrating one section in worst case
+ */
+ wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) *
+ reserved_segments(sbi);
+ wanted_reserved_segments -= reserved_segments(sbi);
+
+ avail_user_block_count = sbi->user_block_count -
+ sbi->current_reserved_blocks -
+ F2FS_OPTION(sbi).root_reserved_blocks;
+
+ if (wanted_reserved_segments * sbi->blocks_per_seg >
+ avail_user_block_count) {
+ f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u",
+ wanted_reserved_segments,
+ avail_user_block_count >> sbi->log_blocks_per_seg);
+ return -ENOSPC;
+ }
+
+ SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments;
+
+ f2fs_info(sbi, "IO align feature needs additional reserved segment: %u",
+ wanted_reserved_segments);
+
+ return 0;
+}
+
static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi)
{
if (!F2FS_OPTION(sbi).unusable_cap_perc)
@@ -592,7 +625,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
if (kstrtouint(str + 1, 10, &level))
return -EINVAL;
- if (!level || level > ZSTD_maxCLevel()) {
+ if (!level || level > zstd_max_clevel()) {
f2fs_info(sbi, "invalid zstd compress level: %d", level);
return -EINVAL;
}
@@ -817,6 +850,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
} else if (!strcmp(name, "lfs")) {
F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
+ } else if (!strcmp(name, "fragment:segment")) {
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG;
+ } else if (!strcmp(name, "fragment:block")) {
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK;
} else {
kfree(name);
return -EINVAL;
@@ -936,22 +973,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
f2fs_info(sbi, "quota operations not supported");
break;
#endif
- case Opt_whint:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "user-based")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER;
- } else if (!strcmp(name, "off")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
- } else if (!strcmp(name, "fs-based")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
case Opt_alloc:
name = match_strdup(&args[0]);
if (!name)
@@ -1220,7 +1241,7 @@ default_check:
return -EINVAL;
}
#endif
-#ifndef CONFIG_UNICODE
+#if !IS_ENABLED(CONFIG_UNICODE)
if (f2fs_sb_has_casefold(sbi)) {
f2fs_err(sbi,
"Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE");
@@ -1289,12 +1310,6 @@ default_check:
return -EINVAL;
}
- /* Not pass down write hints if the number of active logs is lesser
- * than NR_CURSEG_PERSIST_TYPE.
- */
- if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE)
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
-
if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
@@ -1306,8 +1321,12 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
- fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep,
- GFP_F2FS_ZERO, false, F2FS_SB(sb));
+ if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) {
+ f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC);
+ return NULL;
+ }
+
+ fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO);
if (!fi)
return NULL;
@@ -1316,16 +1335,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
atomic_set(&fi->i_compr_blocks, 0);
- init_rwsem(&fi->i_sem);
+ init_f2fs_rwsem(&fi->i_sem);
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_ilist);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
- init_rwsem(&fi->i_gc_rwsem[READ]);
- init_rwsem(&fi->i_gc_rwsem[WRITE]);
- init_rwsem(&fi->i_xattr_sem);
+ init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
+ init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
+ init_f2fs_rwsem(&fi->i_xattr_sem);
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
@@ -1462,8 +1481,9 @@ static void f2fs_free_inode(struct inode *inode)
static void destroy_percpu_info(struct f2fs_sb_info *sbi)
{
- percpu_counter_destroy(&sbi->alloc_valid_block_count);
percpu_counter_destroy(&sbi->total_valid_inode_count);
+ percpu_counter_destroy(&sbi->rf_node_block_count);
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
}
static void destroy_device_list(struct f2fs_sb_info *sbi)
@@ -1580,7 +1600,7 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_destroy_iostat(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
#endif
kfree(sbi);
@@ -1623,11 +1643,15 @@ static int f2fs_freeze(struct super_block *sb)
/* ensure no checkpoint required */
if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list))
return -EINVAL;
+
+ /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */
+ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
return 0;
}
static int f2fs_unfreeze(struct super_block *sb)
{
+ clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
return 0;
}
@@ -1896,6 +1920,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, "adaptive");
else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS)
seq_puts(seq, "lfs");
+ else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG)
+ seq_puts(seq, "fragment:segment");
+ else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
+ seq_puts(seq, "fragment:block");
seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
if (test_opt(sbi, RESERVE_ROOT))
seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u",
@@ -1926,10 +1954,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",prjquota");
#endif
f2fs_show_quota_options(seq, sbi->sb);
- if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER)
- seq_printf(seq, ",whint_mode=%s", "user-based");
- else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
- seq_printf(seq, ",whint_mode=%s", "fs-based");
fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb);
@@ -1981,7 +2005,6 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
@@ -2032,6 +2055,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
+ unsigned int gc_mode;
int err = 0;
int ret;
block_t unusable;
@@ -2044,8 +2068,11 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
f2fs_update_time(sbi, DISABLE_TIME);
+ gc_mode = sbi->gc_mode;
+ sbi->gc_mode = GC_URGENT_HIGH;
+
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err == -ENODATA) {
err = 0;
@@ -2067,7 +2094,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
goto restore_flag;
}
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
cpc.reason = CP_PAUSE;
set_sbi_flag(sbi, SBI_CP_DISABLED);
err = f2fs_write_checkpoint(sbi, &cpc);
@@ -2079,8 +2106,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
spin_unlock(&sbi->stat_lock);
out_unlock:
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
restore_flag:
+ sbi->gc_mode = gc_mode;
sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
return err;
}
@@ -2092,19 +2120,18 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
/* we should flush all the data to keep data consistency */
do {
sync_inodes_sb(sbi->sb);
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
} while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);
if (unlikely(retry < 0))
f2fs_warn(sbi, "checkpoint=enable has some unwritten data.");
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
clear_sbi_flag(sbi, SBI_CP_DISABLED);
set_sbi_flag(sbi, SBI_IS_DIRTY);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
f2fs_sync_fs(sbi->sb, 1);
}
@@ -2258,8 +2285,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
- if (*flags & SB_RDONLY ||
- F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) {
+ if (*flags & SB_RDONLY) {
sync_inodes_sb(sb);
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -2406,8 +2432,7 @@ repeat:
page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
if (IS_ERR(page)) {
if (PTR_ERR(page) == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ memalloc_retry_wait(GFP_NOFS);
goto repeat;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -2462,8 +2487,7 @@ retry:
&page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -2491,6 +2515,16 @@ retry:
return len - towrite;
}
+int f2fs_dquot_initialize(struct inode *inode)
+{
+ if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) {
+ f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT);
+ return -ESRCH;
+ }
+
+ return dquot_initialize(inode);
+}
+
static struct dquot **f2fs_get_dquots(struct inode *inode)
{
return F2FS_I(inode)->i_dquot;
@@ -2636,7 +2670,7 @@ int f2fs_quota_sync(struct super_block *sb, int type)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct quota_info *dqopt = sb_dqopt(sb);
int cnt;
- int ret;
+ int ret = 0;
/*
* Now when everything is written we can discard the pagecache so
@@ -2647,26 +2681,26 @@ int f2fs_quota_sync(struct super_block *sb, int type)
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_active(sb, type))
- return 0;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
inode_lock(dqopt->files[cnt]);
/*
* do_quotactl
* f2fs_quota_sync
- * down_read(quota_sem)
+ * f2fs_down_read(quota_sem)
* dquot_writeback_dquots()
* f2fs_dquot_commit
* block_operation
- * down_read(quota_sem)
+ * f2fs_down_read(quota_sem)
*/
f2fs_lock_op(sbi);
- down_read(&sbi->quota_sem);
+ f2fs_down_read(&sbi->quota_sem);
ret = f2fs_quota_sync_file(sbi, cnt);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
f2fs_unlock_op(sbi);
inode_unlock(dqopt->files[cnt]);
@@ -2791,11 +2825,11 @@ static int f2fs_dquot_commit(struct dquot *dquot)
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
int ret;
- down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING);
+ f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING);
ret = dquot_commit(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
return ret;
}
@@ -2804,11 +2838,11 @@ static int f2fs_dquot_acquire(struct dquot *dquot)
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
int ret;
- down_read(&sbi->quota_sem);
+ f2fs_down_read(&sbi->quota_sem);
ret = dquot_acquire(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
return ret;
}
@@ -2875,6 +2909,11 @@ static const struct quotactl_ops f2fs_quotactl_ops = {
.get_nextdqblk = dquot_get_next_dqblk,
};
#else
+int f2fs_dquot_initialize(struct inode *inode)
+{
+ return 0;
+}
+
int f2fs_quota_sync(struct super_block *sb, int type)
{
return 0;
@@ -2976,7 +3015,6 @@ static const struct fscrypt_operations f2fs_cryptops = {
.set_context = f2fs_set_context,
.get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
- .max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
.get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
.get_num_devices = f2fs_get_num_devices,
@@ -3487,7 +3525,7 @@ skip_cross:
NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) {
f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)",
cp_payload, nat_bits_blocks);
- return -EFSCORRUPTED;
+ return 1;
}
if (unlikely(f2fs_cp_error(sbi))) {
@@ -3518,11 +3556,15 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
sbi->cur_victim_sec = NULL_SECNO;
+ sbi->gc_mode = GC_NORMAL;
sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
sbi->migration_granularity = sbi->segs_per_sec;
sbi->seq_file_ra_mul = MIN_RA_MUL;
+ sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
+ sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
+ spin_lock_init(&sbi->gc_urgent_high_lock);
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
@@ -3542,14 +3584,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
- init_rwsem(&sbi->io_order_lock);
+ init_f2fs_rwsem(&sbi->io_order_lock);
spin_lock_init(&sbi->cp_lock);
sbi->dirty_device = 0;
spin_lock_init(&sbi->dev_lock);
- init_rwsem(&sbi->sb_lock);
- init_rwsem(&sbi->pin_sem);
+ init_f2fs_rwsem(&sbi->sb_lock);
+ init_f2fs_rwsem(&sbi->pin_sem);
}
static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -3560,11 +3602,20 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
if (err)
return err;
+ err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL);
+ if (err)
+ goto err_valid_block;
+
err = percpu_counter_init(&sbi->total_valid_inode_count, 0,
GFP_KERNEL);
if (err)
- percpu_counter_destroy(&sbi->alloc_valid_block_count);
+ goto err_node_block;
+ return 0;
+err_node_block:
+ percpu_counter_destroy(&sbi->rf_node_block_count);
+err_valid_block:
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
return err;
}
@@ -3747,6 +3798,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
unsigned int max_devices = MAX_DEVICES;
+ unsigned int logical_blksize;
int i;
/* Initialize single device information */
@@ -3767,6 +3819,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
if (!sbi->devs)
return -ENOMEM;
+ logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev);
+ sbi->aligned_blksize = true;
+
for (i = 0; i < max_devices; i++) {
if (i > 0 && !RDEV(i).path[0])
@@ -3803,6 +3858,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
/* to release errored devices */
sbi->s_ndevs = i + 1;
+ if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev))
+ sbi->aligned_blksize = false;
+
#ifdef CONFIG_BLK_DEV_ZONED
if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
!f2fs_sb_has_blkzoned(sbi)) {
@@ -3837,31 +3895,38 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) {
const struct f2fs_sb_encodings *encoding_info;
struct unicode_map *encoding;
__u16 encoding_flags;
- if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info,
- &encoding_flags)) {
+ encoding_info = f2fs_sb_read_encoding(sbi->raw_super);
+ if (!encoding_info) {
f2fs_err(sbi,
"Encoding requested by superblock is unknown");
return -EINVAL;
}
+ encoding_flags = le16_to_cpu(sbi->raw_super->s_encoding_flags);
encoding = utf8_load(encoding_info->version);
if (IS_ERR(encoding)) {
f2fs_err(sbi,
- "can't mount with superblock charset: %s-%s "
+ "can't mount with superblock charset: %s-%u.%u.%u "
"not supported by the kernel. flags: 0x%x.",
- encoding_info->name, encoding_info->version,
+ encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
encoding_flags);
return PTR_ERR(encoding);
}
f2fs_info(sbi, "Using encoding defined by superblock: "
- "%s-%s with flags 0x%hx", encoding_info->name,
- encoding_info->version?:"\b", encoding_flags);
+ "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
sbi->sb->s_encoding = encoding;
sbi->sb->s_encoding_flags = encoding_flags;
@@ -3884,7 +3949,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
if (f2fs_block_unit_discard(sbi))
sm_i->dcc_info->discard_granularity = 1;
- sm_i->ipu_policy = 1 << F2FS_IPU_FORCE;
+ sm_i->ipu_policy = 1 << F2FS_IPU_FORCE |
+ 1 << F2FS_IPU_HONOR_OPU_WRITE;
}
sbi->readdir_ra = 1;
@@ -3994,11 +4060,11 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
- init_rwsem(&sbi->gc_lock);
+ init_f2fs_rwsem(&sbi->gc_lock);
mutex_init(&sbi->writepages);
- init_rwsem(&sbi->cp_global_sem);
- init_rwsem(&sbi->node_write);
- init_rwsem(&sbi->node_change);
+ init_f2fs_rwsem(&sbi->cp_global_sem);
+ init_f2fs_rwsem(&sbi->node_write);
+ init_f2fs_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
@@ -4019,18 +4085,18 @@ try_onemore:
}
for (j = HOT; j < n; j++) {
- init_rwsem(&sbi->write_io[i][j].io_rwsem);
+ init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
sbi->write_io[i][j].sbi = sbi;
sbi->write_io[i][j].bio = NULL;
spin_lock_init(&sbi->write_io[i][j].io_lock);
INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
- init_rwsem(&sbi->write_io[i][j].bio_list_lock);
+ init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
}
}
- init_rwsem(&sbi->cp_rwsem);
- init_rwsem(&sbi->quota_sem);
+ init_f2fs_rwsem(&sbi->cp_rwsem);
+ init_f2fs_rwsem(&sbi->quota_sem);
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
@@ -4148,6 +4214,10 @@ try_onemore:
goto free_nm;
}
+ err = adjust_reserved_segment(sbi);
+ if (err)
+ goto free_nm;
+
/* For write statistics */
sbi->sectors_written_start = f2fs_get_sectors_written(sbi);
@@ -4352,6 +4422,8 @@ free_node_inode:
free_stats:
f2fs_destroy_stats(sbi);
free_nm:
+ /* stop discard thread before destroying node manager */
+ f2fs_stop_discard_thread(sbi);
f2fs_destroy_node_manager(sbi);
free_sm:
f2fs_destroy_segment_manager(sbi);
@@ -4379,7 +4451,7 @@ free_bio_info:
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
sb->s_encoding = NULL;
#endif
@@ -4449,7 +4521,7 @@ static struct file_system_type f2fs_fs_type = {
.name = "f2fs",
.mount = f2fs_mount,
.kill_sb = kill_f2fs_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("f2fs");
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index a32fe31c33b8..4c50aedd5144 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -41,6 +41,16 @@ enum {
ATGC_INFO, /* struct atgc_management */
};
+static const char *gc_mode_names[MAX_GC_MODE] = {
+ "GC_NORMAL",
+ "GC_IDLE_CB",
+ "GC_IDLE_GREEDY",
+ "GC_IDLE_AT",
+ "GC_URGENT_HIGH",
+ "GC_URGENT_LOW",
+ "GC_URGENT_MID"
+};
+
struct f2fs_attr {
struct attribute attr;
ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
@@ -118,6 +128,15 @@ static ssize_t sb_status_show(struct f2fs_attr *a,
return sprintf(buf, "%lx\n", sbi->s_flag);
}
+static ssize_t pending_discard_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ if (!SM_I(sbi)->dcc_info)
+ return -EINVAL;
+ return sprintf(buf, "%llu\n", (unsigned long long)atomic_read(
+ &SM_I(sbi)->dcc_info->discard_cmd_cnt));
+}
+
static ssize_t features_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -192,12 +211,11 @@ static ssize_t unusable_show(struct f2fs_attr *a,
static ssize_t encoding_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
struct super_block *sb = sbi->sb;
if (f2fs_sb_has_casefold(sbi))
- return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n",
- sb->s_encoding->charset,
+ return sysfs_emit(buf, "UTF-8 (%d.%d.%d)\n",
(sb->s_encoding->version >> 16) & 0xff,
(sb->s_encoding->version >> 8) & 0xff,
sb->s_encoding->version & 0xff);
@@ -245,7 +263,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a,
static ssize_t main_blkaddr_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)MAIN_BLKADDR(sbi));
}
@@ -308,8 +326,13 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
+ if (!strcmp(a->attr.name, "gc_urgent"))
+ return sysfs_emit(buf, "%s\n",
+ gc_mode_names[sbi->gc_mode]);
+
if (!strcmp(a->attr.name, "gc_segment_mode"))
- return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
+ return sysfs_emit(buf, "%s\n",
+ gc_mode_names[sbi->gc_segment_mode]);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return sysfs_emit(buf, "%u\n",
@@ -355,7 +378,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN)
return -EINVAL;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
ret = f2fs_update_extension_list(sbi, name, hot, set);
if (ret)
@@ -365,7 +388,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
if (ret)
f2fs_update_extension_list(sbi, name, hot, !set);
out:
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
return ret ? ret : count;
}
@@ -415,7 +438,9 @@ out:
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
if (t > (unsigned long)(sbi->user_block_count -
- F2FS_OPTION(sbi).root_reserved_blocks)) {
+ F2FS_OPTION(sbi).root_reserved_blocks -
+ sbi->blocks_per_seg *
+ SM_I(sbi)->additional_reserved_segments)) {
spin_unlock(&sbi->stat_lock);
return -EINVAL;
}
@@ -458,6 +483,13 @@ out:
}
} else if (t == 2) {
sbi->gc_mode = GC_URGENT_LOW;
+ } else if (t == 3) {
+ sbi->gc_mode = GC_URGENT_MID;
+ if (sbi->gc_thread) {
+ sbi->gc_thread->gc_wake = 1;
+ wake_up_interruptible_all(
+ &sbi->gc_thread->gc_wait_queue_head);
+ }
} else {
return -EINVAL;
}
@@ -471,13 +503,22 @@ out:
} else if (t == GC_IDLE_AT) {
if (!sbi->am.atgc_enabled)
return -EINVAL;
- sbi->gc_mode = GC_AT;
+ sbi->gc_mode = GC_IDLE_AT;
} else {
sbi->gc_mode = GC_NORMAL;
}
return count;
}
+ if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) {
+ spin_lock(&sbi->gc_urgent_high_lock);
+ sbi->gc_urgent_high_limited = t != 0;
+ sbi->gc_urgent_high_remaining = t;
+ spin_unlock(&sbi->gc_urgent_high_lock);
+
+ return count;
+ }
+
#ifdef CONFIG_F2FS_IOSTAT
if (!strcmp(a->attr.name, "iostat_enable")) {
sbi->iostat_enable = !!t;
@@ -551,6 +592,22 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "max_fragment_chunk")) {
+ if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
+ sbi->max_fragment_chunk = t;
+ else
+ return -EINVAL;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "max_fragment_hole")) {
+ if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
+ sbi->max_fragment_hole = t;
+ else
+ return -EINVAL;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -681,6 +738,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
@@ -693,6 +754,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -717,6 +779,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining);
F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
F2FS_GENERAL_RO_ATTR(dirty_segments);
F2FS_GENERAL_RO_ATTR(free_segments);
@@ -728,6 +791,7 @@ F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
+F2FS_GENERAL_RO_ATTR(pending_discard);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@@ -741,7 +805,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks);
#ifdef CONFIG_FS_ENCRYPTION
F2FS_FEATURE_RO_ATTR(encryption);
F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
F2FS_FEATURE_RO_ATTR(encrypted_casefold);
#endif
#endif /* CONFIG_FS_ENCRYPTION */
@@ -760,7 +824,7 @@ F2FS_FEATURE_RO_ATTR(lost_found);
F2FS_FEATURE_RO_ATTR(verity);
#endif
F2FS_FEATURE_RO_ATTR(sb_checksum);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
F2FS_FEATURE_RO_ATTR(casefold);
#endif
F2FS_FEATURE_RO_ATTR(readonly);
@@ -781,6 +845,8 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -793,7 +859,12 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reclaim_segments),
ATTR_LIST(main_blkaddr),
ATTR_LIST(max_small_discards),
+ ATTR_LIST(max_discard_request),
+ ATTR_LIST(min_discard_issue_time),
+ ATTR_LIST(mid_discard_issue_time),
+ ATTR_LIST(max_discard_issue_time),
ATTR_LIST(discard_granularity),
+ ATTR_LIST(pending_discard),
ATTR_LIST(batched_trim_sections),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
@@ -807,6 +878,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
ATTR_LIST(dirty_nats_ratio),
+ ATTR_LIST(max_roll_forward_node_blocks),
ATTR_LIST(cp_interval),
ATTR_LIST(idle_interval),
ATTR_LIST(discard_idle_interval),
@@ -826,6 +898,7 @@ static struct attribute *f2fs_attrs[] = {
#endif
ATTR_LIST(data_io_flag),
ATTR_LIST(node_io_flag),
+ ATTR_LIST(gc_urgent_high_remaining),
ATTR_LIST(ckpt_thread_ioprio),
ATTR_LIST(dirty_segments),
ATTR_LIST(free_segments),
@@ -859,6 +932,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(seq_file_ra_mul),
ATTR_LIST(gc_segment_mode),
ATTR_LIST(gc_reclaimed_segments),
+ ATTR_LIST(max_fragment_chunk),
+ ATTR_LIST(max_fragment_hole),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -867,7 +942,7 @@ static struct attribute *f2fs_feat_attrs[] = {
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
ATTR_LIST(test_dummy_encryption_v2),
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
ATTR_LIST(encrypted_casefold),
#endif
#endif /* CONFIG_FS_ENCRYPTION */
@@ -886,7 +961,7 @@ static struct attribute *f2fs_feat_attrs[] = {
ATTR_LIST(verity),
#endif
ATTR_LIST(sb_checksum),
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
ATTR_LIST(casefold),
#endif
ATTR_LIST(readonly),
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 03549b5ba204..3d793202cc9f 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -136,7 +136,7 @@ static int f2fs_begin_enable_verity(struct file *filp)
* here and not rely on ->open() doing it. This must be done before
* evicting the inline data.
*/
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
@@ -208,7 +208,7 @@ cleanup:
* from re-instantiating cached pages we are truncating (since unlike
* normal file accesses, garbage collection isn't limited by i_size).
*/
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
truncate_inode_pages(inode->i_mapping, inode->i_size);
err2 = f2fs_truncate(inode);
if (err2) {
@@ -216,7 +216,7 @@ cleanup:
err2);
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
return err ?: err2;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 1d2d29dcd41c..c76c15086e5f 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -226,15 +226,18 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int index)
}
static struct f2fs_xattr_entry *__find_xattr(void *base_addr,
- void *last_base_addr, int index,
- size_t len, const char *name)
+ void *last_base_addr, void **last_addr,
+ int index, size_t len, const char *name)
{
struct f2fs_xattr_entry *entry;
list_for_each_xattr(entry, base_addr) {
if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
- (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr)
+ (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) {
+ if (last_addr)
+ *last_addr = entry;
return NULL;
+ }
if (entry->e_name_index != index)
continue;
@@ -254,19 +257,9 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
unsigned int inline_size = inline_xattr_size(inode);
void *max_addr = base_addr + inline_size;
- list_for_each_xattr(entry, base_addr) {
- if ((void *)entry + sizeof(__u32) > max_addr ||
- (void *)XATTR_NEXT_ENTRY(entry) > max_addr) {
- *last_addr = entry;
- return NULL;
- }
- if (entry->e_name_index != index)
- continue;
- if (entry->e_name_len != len)
- continue;
- if (!memcmp(entry->e_name, name, len))
- break;
- }
+ entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name);
+ if (!entry)
+ return NULL;
/* inline xattr header or entry across max inline xattr size */
if (IS_XATTR_LAST_ENTRY(entry) &&
@@ -368,7 +361,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
else
cur_addr = txattr_addr;
- *xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name);
+ *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name);
if (!*xe) {
f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
inode->i_ino);
@@ -532,10 +525,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
- down_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
&entry, &base_addr, &base_size, &is_inline);
- up_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -569,9 +562,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
int error;
size_t rest = buffer_size;
- down_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = read_all_xattrs(inode, NULL, &base_addr);
- up_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -659,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
/* find entry with wanted name. */
- here = __find_xattr(base_addr, last_base_addr, index, len, name);
+ here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name);
if (!here) {
f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
inode->i_ino);
@@ -684,8 +677,17 @@ static int __f2fs_setxattr(struct inode *inode, int index,
}
last = here;
- while (!IS_XATTR_LAST_ENTRY(last))
+ while (!IS_XATTR_LAST_ENTRY(last)) {
+ if ((void *)(last) + sizeof(__u32) > last_base_addr ||
+ (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) {
+ f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu",
+ inode->i_ino, ENTRY_SIZE(last));
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+ error = -EFSCORRUPTED;
+ goto exit;
+ }
last = XATTR_NEXT_ENTRY(last);
+ }
newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size);
@@ -773,7 +775,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
@@ -784,9 +786,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- down_write(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
- up_write(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_xattr_sem);
f2fs_unlock_op(sbi);
f2fs_update_time(sbi, REQ_TIME);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index c4a274285858..249825017da7 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -722,7 +722,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \
if (name_len >= sizeof(d1->d_name)) \
name_len = sizeof(d1->d_name) - 1; \
\
- if (put_user(0, d2->d_name) || \
+ if (put_user(0, &d2->d_name[0]) || \
put_user(0, &d2->d_reclen) || \
copy_to_user(d1->d_name, name, name_len) || \
put_user(0, d1->d_name + name_len) || \
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 13855ba49cd9..a5a309fcc7fa 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -175,9 +175,10 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
static int fat_file_release(struct inode *inode, struct file *filp)
{
if ((filp->f_mode & FMODE_WRITE) &&
- MSDOS_SB(inode->i_sb)->options.flush) {
+ MSDOS_SB(inode->i_sb)->options.flush) {
fat_flush_inodes(inode->i_sb, inode, NULL);
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(HZ/10);
}
return 0;
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0c9b013a85..bf6051bdf1d1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -342,7 +342,8 @@ int fat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations fat_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = fat_readpage,
.readahead = fat_readahead,
.writepage = fat_writepage,
@@ -745,7 +746,7 @@ static struct kmem_cache *fat_inode_cachep;
static struct inode *fat_alloc_inode(struct super_block *sb)
{
struct msdos_inode_info *ei;
- ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, fat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
@@ -1536,14 +1537,11 @@ static int fat_read_static_bpb(struct super_block *sb,
struct fat_bios_param_block *bpb)
{
static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
-
+ sector_t bd_sects = bdev_nr_sectors(sb->s_bdev);
struct fat_floppy_defaults *fdefaults = NULL;
int error = -EINVAL;
- sector_t bd_sects;
unsigned i;
- bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
-
/* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
if (!silent)
@@ -1943,10 +1941,8 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
ret = writeback_inode(i1);
if (!ret && i2)
ret = writeback_inode(i2);
- if (!ret) {
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
- ret = filemap_flush(mapping);
- }
+ if (!ret)
+ ret = sync_blockdev_nowait(sb->s_bdev);
return ret;
}
EXPORT_SYMBOL_GPL(fat_flush_inodes);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9c6c6a3e2de5..f15d885b9796 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -291,22 +291,6 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd,
u64 h;
switch (cmd) {
- case F_GET_FILE_RW_HINT:
- h = file_write_hint(file);
- if (copy_to_user(argp, &h, sizeof(*argp)))
- return -EFAULT;
- return 0;
- case F_SET_FILE_RW_HINT:
- if (copy_from_user(&h, argp, sizeof(h)))
- return -EFAULT;
- hint = (enum rw_hint) h;
- if (!rw_hint_valid(hint))
- return -EINVAL;
-
- spin_lock(&file->f_lock);
- file->f_write_hint = hint;
- spin_unlock(&file->f_lock);
- return 0;
case F_GET_RW_HINT:
h = inode->i_write_hint;
if (copy_to_user(argp, &h, sizeof(*argp)))
@@ -431,8 +415,6 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
break;
case F_GET_RW_HINT:
case F_SET_RW_HINT:
- case F_GET_FILE_RW_HINT:
- case F_SET_FILE_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
break;
default:
diff --git a/fs/file.c b/fs/file.c
index 8627dacfc424..ee9317346702 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -87,6 +87,21 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}
+/*
+ * Note how the fdtable bitmap allocations very much have to be a multiple of
+ * BITS_PER_LONG. This is not only because we walk those things in chunks of
+ * 'unsigned long' in some places, but simply because that is how the Linux
+ * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
+ * they are very much "bits in an array of unsigned long".
+ *
+ * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
+ * by that "1024/sizeof(ptr)" before, we already know there are sufficient
+ * clear low bits. Clang seems to realize that, gcc ends up being confused.
+ *
+ * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
+ * let's consider it documentation (and maybe a test-case for gcc to improve
+ * its code generation ;)
+ */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
struct fdtable *fdt;
@@ -102,6 +117,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
+ nr = ALIGN(nr, BITS_PER_LONG);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
* had been set lower between the check in expand_files() and here. Deal
@@ -269,6 +285,19 @@ static unsigned int count_open_files(struct fdtable *fdt)
return i;
}
+/*
+ * Note that a sane fdtable size always has to be a multiple of
+ * BITS_PER_LONG, since we have bitmaps that are sized by this.
+ *
+ * 'max_fds' will normally already be properly aligned, but it
+ * turns out that in the close_range() -> __close_range() ->
+ * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
+ * up having a 'max_fds' value that isn't already aligned.
+ *
+ * Rather than make close_range() have to worry about this,
+ * just make that BITS_PER_LONG alignment be part of a sane
+ * fdtable size. Because that's really what it is.
+ */
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
unsigned int count;
@@ -276,7 +305,7 @@ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
count = count_open_files(fdt);
if (max_fds < NR_OPEN_DEFAULT)
max_fds = NR_OPEN_DEFAULT;
- return min(count, max_fds);
+ return ALIGN(min(count, max_fds), BITS_PER_LONG);
}
/*
@@ -841,24 +870,68 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
+static inline struct file *__fget_files_rcu(struct files_struct *files,
+ unsigned int fd, fmode_t mask, unsigned int refs)
+{
+ for (;;) {
+ struct file *file;
+ struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+ struct file __rcu **fdentry;
+
+ if (unlikely(fd >= fdt->max_fds))
+ return NULL;
+
+ fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
+ file = rcu_dereference_raw(*fdentry);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(file->f_mode & mask))
+ return NULL;
+
+ /*
+ * Ok, we have a file pointer. However, because we do
+ * this all locklessly under RCU, we may be racing with
+ * that file being closed.
+ *
+ * Such a race can take two forms:
+ *
+ * (a) the file ref already went down to zero,
+ * and get_file_rcu_many() fails. Just try
+ * again:
+ */
+ if (unlikely(!get_file_rcu_many(file, refs)))
+ continue;
+
+ /*
+ * (b) the file table entry has changed under us.
+ * Note that we don't need to re-check the 'fdt->fd'
+ * pointer having changed, because it always goes
+ * hand-in-hand with 'fdt'.
+ *
+ * If so, we need to put our refs and try again.
+ */
+ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
+ unlikely(rcu_dereference_raw(*fdentry) != file)) {
+ fput_many(file, refs);
+ continue;
+ }
+
+ /*
+ * Ok, we have a ref to the file, and checked that it
+ * still exists.
+ */
+ return file;
+ }
+}
+
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
fmode_t mask, unsigned int refs)
{
struct file *file;
rcu_read_lock();
-loop:
- file = files_lookup_fd_rcu(files, fd);
- if (file) {
- /* File object ref couldn't be taken.
- * dup2() atomicity guarantee is the reason
- * we loop to catch the new file (or NULL pointer)
- */
- if (file->f_mode & mask)
- file = NULL;
- else if (!get_file_rcu_many(file, refs))
- goto loop;
- }
+ file = __fget_files_rcu(files, fd, mask, refs);
rcu_read_unlock();
return file;
diff --git a/fs/file_table.c b/fs/file_table.c
index 45437f8e1003..ada8fe814db9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -27,13 +27,14 @@
#include <linux/task_work.h>
#include <linux/ima.h>
#include <linux/swap.h>
+#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include "internal.h"
/* sysctl tunables... */
-struct files_stat_struct files_stat = {
+static struct files_stat_struct files_stat = {
.max_files = NR_FILE
};
@@ -75,22 +76,58 @@ unsigned long get_max_files(void)
}
EXPORT_SYMBOL_GPL(get_max_files);
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+
/*
* Handle nr_files sysctl
*/
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-int proc_nr_files(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_nr_files(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
files_stat.nr_files = get_nr_files();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
-#else
-int proc_nr_files(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+
+static struct ctl_table fs_stat_sysctls[] = {
+ {
+ .procname = "file-nr",
+ .data = &files_stat,
+ .maxlen = sizeof(files_stat),
+ .mode = 0444,
+ .proc_handler = proc_nr_files,
+ },
+ {
+ .procname = "file-max",
+ .data = &files_stat.max_files,
+ .maxlen = sizeof(files_stat.max_files),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = SYSCTL_LONG_ZERO,
+ .extra2 = SYSCTL_LONG_MAX,
+ },
+ {
+ .procname = "nr_open",
+ .data = &sysctl_nr_open,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &sysctl_nr_open_min,
+ .extra2 = &sysctl_nr_open_max,
+ },
+ { }
+};
+
+static int __init init_fs_stat_sysctls(void)
{
- return -ENOSYS;
+ register_sysctl_init("fs", fs_stat_sysctls);
+ if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
+ struct ctl_table_header *hdr;
+ hdr = register_sysctl_mount_point("fs/binfmt_misc");
+ kmemleak_not_leak(hdr);
+ }
+ return 0;
}
+fs_initcall(init_fs_stat_sysctls);
#endif
static struct file *__alloc_file(int flags, const struct cred *cred)
@@ -375,6 +412,7 @@ void __fput_sync(struct file *file)
}
EXPORT_SYMBOL(fput);
+EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 578a5062706e..22eed5a73ac2 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -124,7 +124,7 @@ static struct inode *vxfs_alloc_inode(struct super_block *sb)
{
struct vxfs_inode_info *vi;
- vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL);
+ vi = alloc_inode_sb(sb, vxfs_inode_cachep, GFP_KERNEL);
if (!vi)
return NULL;
inode_init_once(&vi->vfs_inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 81ec192ce067..1fae0196292a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -372,7 +372,7 @@ static bool inode_do_switch_wbs(struct inode *inode,
{
struct address_space *mapping = inode->i_mapping;
XA_STATE(xas, &mapping->i_pages, 0);
- struct page *page;
+ struct folio *folio;
bool switched = false;
spin_lock(&inode->i_lock);
@@ -389,21 +389,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
/*
* Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
- * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
- * pages actually under writeback.
+ * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to
+ * folios actually under writeback.
*/
- xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
- if (PageDirty(page)) {
- dec_wb_stat(old_wb, WB_RECLAIMABLE);
- inc_wb_stat(new_wb, WB_RECLAIMABLE);
+ xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
+ if (folio_test_dirty(folio)) {
+ long nr = folio_nr_pages(folio);
+ wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
+ wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
}
}
xas_set(&xas, 0);
- xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
- WARN_ON_ONCE(!PageWriteback(page));
- dec_wb_stat(old_wb, WB_WRITEBACK);
- inc_wb_stat(new_wb, WB_WRITEBACK);
+ xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
+ long nr = folio_nr_pages(folio);
+ WARN_ON_ONCE(!folio_test_writeback(folio));
+ wb_stat_mod(old_wb, WB_WRITEBACK, -nr);
+ wb_stat_mod(new_wb, WB_WRITEBACK, nr);
}
if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
@@ -566,7 +568,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
return;
- isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
+ isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC);
if (!isw)
return;
@@ -624,8 +626,8 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
int nr;
bool restart = false;
- isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
- sizeof(struct inode *), GFP_KERNEL);
+ isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
+ GFP_KERNEL);
if (!isw)
return restart;
@@ -892,43 +894,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
/**
- * inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion (may be NULL)
- * @cong_bits: mask of WB_[a]sync_congested bits to test
- *
- * Tests whether @inode is congested. @cong_bits is the mask of congestion
- * bits to test and the return value is the mask of set bits.
- *
- * If cgroup writeback is enabled for @inode, the congestion state is
- * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
- * associated with @inode is congested; otherwise, the root wb's congestion
- * state is used.
- *
- * @inode is allowed to be NULL as this function is often called on
- * mapping->host which is NULL for the swapper space.
- */
-int inode_congested(struct inode *inode, int cong_bits)
-{
- /*
- * Once set, ->i_wb never becomes NULL while the inode is alive.
- * Start transaction iff ->i_wb is visible.
- */
- if (inode && inode_to_wb_is_valid(inode)) {
- struct bdi_writeback *wb;
- struct wb_lock_cookie lock_cookie = {};
- bool congested;
-
- wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
- congested = wb_congested(wb, cong_bits);
- unlocked_inode_to_wb_end(inode, &lock_cookie);
- return congested;
- }
-
- return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
-}
-EXPORT_SYMBOL_GPL(inode_congested);
-
-/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
@@ -1666,6 +1631,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state |= I_DIRTY_PAGES;
+ else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
+ if (!(inode->i_state & I_DIRTY_PAGES)) {
+ inode->i_state &= ~I_PINNING_FSCACHE_WB;
+ wbc->unpinned_fscache_wb = true;
+ dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
+ }
+ }
spin_unlock(&inode->i_lock);
@@ -1675,6 +1647,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (ret == 0)
ret = err;
}
+ wbc->unpinned_fscache_wb = false;
trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
@@ -1739,6 +1712,10 @@ static int writeback_single_inode(struct inode *inode,
*/
if (!(inode->i_state & I_DIRTY_ALL))
inode_cgwb_move_to_attached(inode, wb);
+ else if (!(inode->i_state & I_SYNC_QUEUED) &&
+ (inode->i_state & I_DIRTY))
+ redirty_tail_locked(inode, wb);
+
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -1893,7 +1870,7 @@ static long writeback_sb_inodes(struct super_block *sb,
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
- blk_flush_plug(current);
+ blk_flush_plug(current->plug, false);
cond_resched();
}
@@ -2223,7 +2200,6 @@ void wb_workfn(struct work_struct *work)
long pages_written;
set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
- current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
!test_bit(WB_registered, &wb->state))) {
@@ -2252,8 +2228,6 @@ void wb_workfn(struct work_struct *work)
wb_wakeup(wb);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
wb_wakeup_delayed(wb);
-
- current->flags &= ~PF_SWAPWRITE;
}
/*
@@ -2290,8 +2264,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
/*
* If we are expecting writeback progress we must submit plugged IO.
*/
- if (blk_needs_flush_plug(current))
- blk_schedule_flush_plug(current);
+ blk_flush_plug(current->plug, true);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
diff --git a/fs/fs_context.c b/fs/fs_context.c
index b7e43a780a62..24ce12f0db32 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -548,7 +548,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
param->key);
}
- if (len > PAGE_SIZE - 2 - size)
+ if (size + len + 2 > PAGE_SIZE)
return invalf(fc, "VFS: Legacy: Cumulative options too large");
if (strchr(param->key, ',') ||
(param->type == fs_value_is_string &&
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 3df07c0e32b3..ed40ce5742fd 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -199,6 +199,8 @@ int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p,
int b;
if (param->type != fs_value_is_string)
return fs_param_bad_value(log, param);
+ if (!*param->string && (p->flags & fs_param_can_be_empty))
+ return 0;
b = lookup_constant(bool_names, param->string, -1);
if (b == -1)
return fs_param_bad_value(log, param);
@@ -211,8 +213,11 @@ int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
int base = (unsigned long)p->data;
- if (param->type != fs_value_is_string ||
- kstrtouint(param->string, base, &result->uint_32) < 0)
+ if (param->type != fs_value_is_string)
+ return fs_param_bad_value(log, param);
+ if (!*param->string && (p->flags & fs_param_can_be_empty))
+ return 0;
+ if (kstrtouint(param->string, base, &result->uint_32) < 0)
return fs_param_bad_value(log, param);
return 0;
}
@@ -221,8 +226,11 @@ EXPORT_SYMBOL(fs_param_is_u32);
int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
- if (param->type != fs_value_is_string ||
- kstrtoint(param->string, 0, &result->int_32) < 0)
+ if (param->type != fs_value_is_string)
+ return fs_param_bad_value(log, param);
+ if (!*param->string && (p->flags & fs_param_can_be_empty))
+ return 0;
+ if (kstrtoint(param->string, 0, &result->int_32) < 0)
return fs_param_bad_value(log, param);
return 0;
}
@@ -231,8 +239,11 @@ EXPORT_SYMBOL(fs_param_is_s32);
int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
- if (param->type != fs_value_is_string ||
- kstrtoull(param->string, 0, &result->uint_64) < 0)
+ if (param->type != fs_value_is_string)
+ return fs_param_bad_value(log, param);
+ if (!*param->string && (p->flags & fs_param_can_be_empty))
+ return 0;
+ if (kstrtoull(param->string, 0, &result->uint_64) < 0)
return fs_param_bad_value(log, param);
return 0;
}
@@ -244,6 +255,8 @@ int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p,
const struct constant_table *c;
if (param->type != fs_value_is_string)
return fs_param_bad_value(log, param);
+ if (!*param->string && (p->flags & fs_param_can_be_empty))
+ return 0;
c = __lookup_constant(p->data, param->string);
if (!c)
return fs_param_bad_value(log, param);
@@ -255,7 +268,8 @@ EXPORT_SYMBOL(fs_param_is_enum);
int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
- if (param->type != fs_value_is_string || !*param->string)
+ if (param->type != fs_value_is_string ||
+ (!*param->string && !(p->flags & fs_param_can_be_empty)))
return fs_param_bad_value(log, param);
return 0;
}
@@ -275,7 +289,8 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
{
switch (param->type) {
case fs_value_is_string:
- if (kstrtouint(param->string, 0, &result->uint_32) < 0)
+ if ((!*param->string && !(p->flags & fs_param_can_be_empty)) ||
+ kstrtouint(param->string, 0, &result->uint_32) < 0)
break;
if (result->uint_32 <= INT_MAX)
return 0;
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 03a871d689bb..afb090ea16c4 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -6,13 +6,9 @@
fscache-y := \
cache.o \
cookie.o \
- fsdef.o \
io.o \
main.o \
- netfs.o \
- object.o \
- operation.o \
- page.o
+ volume.o
fscache-$(CONFIG_PROC_FS) += proc.o
fscache-$(CONFIG_FSCACHE_STATS) += stats.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index bd4f44c1cce0..d645f8b302a2 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -1,209 +1,229 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* FS-Cache cache handling
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#define FSCACHE_DEBUG_LEVEL CACHE
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include "internal.h"
-LIST_HEAD(fscache_cache_list);
+static LIST_HEAD(fscache_caches);
DECLARE_RWSEM(fscache_addremove_sem);
-DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
-EXPORT_SYMBOL(fscache_cache_cleared_wq);
+EXPORT_SYMBOL(fscache_addremove_sem);
+DECLARE_WAIT_QUEUE_HEAD(fscache_clearance_waiters);
+EXPORT_SYMBOL(fscache_clearance_waiters);
-static LIST_HEAD(fscache_cache_tag_list);
+static atomic_t fscache_cache_debug_id;
/*
- * look up a cache tag
+ * Allocate a cache cookie.
*/
-struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
+static struct fscache_cache *fscache_alloc_cache(const char *name)
{
- struct fscache_cache_tag *tag, *xtag;
-
- /* firstly check for the existence of the tag under read lock */
- down_read(&fscache_addremove_sem);
-
- list_for_each_entry(tag, &fscache_cache_tag_list, link) {
- if (strcmp(tag->name, name) == 0) {
- atomic_inc(&tag->usage);
- up_read(&fscache_addremove_sem);
- return tag;
- }
- }
-
- up_read(&fscache_addremove_sem);
-
- /* the tag does not exist - create a candidate */
- xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
- if (!xtag)
- /* return a dummy tag if out of memory */
- return ERR_PTR(-ENOMEM);
-
- atomic_set(&xtag->usage, 1);
- strcpy(xtag->name, name);
-
- /* write lock, search again and add if still not present */
- down_write(&fscache_addremove_sem);
+ struct fscache_cache *cache;
- list_for_each_entry(tag, &fscache_cache_tag_list, link) {
- if (strcmp(tag->name, name) == 0) {
- atomic_inc(&tag->usage);
- up_write(&fscache_addremove_sem);
- kfree(xtag);
- return tag;
+ cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+ if (cache) {
+ if (name) {
+ cache->name = kstrdup(name, GFP_KERNEL);
+ if (!cache->name) {
+ kfree(cache);
+ return NULL;
+ }
}
+ refcount_set(&cache->ref, 1);
+ INIT_LIST_HEAD(&cache->cache_link);
+ cache->debug_id = atomic_inc_return(&fscache_cache_debug_id);
}
-
- list_add_tail(&xtag->link, &fscache_cache_tag_list);
- up_write(&fscache_addremove_sem);
- return xtag;
+ return cache;
}
-/*
- * release a reference to a cache tag
- */
-void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
+static bool fscache_get_cache_maybe(struct fscache_cache *cache,
+ enum fscache_cache_trace where)
{
- if (tag != ERR_PTR(-ENOMEM)) {
- down_write(&fscache_addremove_sem);
+ bool success;
+ int ref;
- if (atomic_dec_and_test(&tag->usage))
- list_del_init(&tag->link);
- else
- tag = NULL;
-
- up_write(&fscache_addremove_sem);
-
- kfree(tag);
- }
+ success = __refcount_inc_not_zero(&cache->ref, &ref);
+ if (success)
+ trace_fscache_cache(cache->debug_id, ref + 1, where);
+ return success;
}
/*
- * select a cache in which to store an object
- * - the cache addremove semaphore must be at least read-locked by the caller
- * - the object will never be an index
+ * Look up a cache cookie.
*/
-struct fscache_cache *fscache_select_cache_for_object(
- struct fscache_cookie *cookie)
+struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache)
{
- struct fscache_cache_tag *tag;
- struct fscache_object *object;
- struct fscache_cache *cache;
+ struct fscache_cache *candidate, *cache, *unnamed = NULL;
- _enter("");
+ /* firstly check for the existence of the cache under read lock */
+ down_read(&fscache_addremove_sem);
- if (list_empty(&fscache_cache_list)) {
- _leave(" = NULL [no cache]");
- return NULL;
+ list_for_each_entry(cache, &fscache_caches, cache_link) {
+ if (cache->name && name && strcmp(cache->name, name) == 0 &&
+ fscache_get_cache_maybe(cache, fscache_cache_get_acquire))
+ goto got_cache_r;
+ if (!cache->name && !name &&
+ fscache_get_cache_maybe(cache, fscache_cache_get_acquire))
+ goto got_cache_r;
}
- /* we check the parent to determine the cache to use */
- spin_lock(&cookie->lock);
+ if (!name) {
+ list_for_each_entry(cache, &fscache_caches, cache_link) {
+ if (cache->name &&
+ fscache_get_cache_maybe(cache, fscache_cache_get_acquire))
+ goto got_cache_r;
+ }
+ }
- /* the first in the parent's backing list should be the preferred
- * cache */
- if (!hlist_empty(&cookie->backing_objects)) {
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
+ up_read(&fscache_addremove_sem);
- cache = object->cache;
- if (fscache_object_is_dying(object) ||
- test_bit(FSCACHE_IOERROR, &cache->flags))
- cache = NULL;
+ /* the cache does not exist - create a candidate */
+ candidate = fscache_alloc_cache(name);
+ if (!candidate)
+ return ERR_PTR(-ENOMEM);
- spin_unlock(&cookie->lock);
- _leave(" = %s [parent]", cache ? cache->tag->name : "NULL");
- return cache;
- }
+ /* write lock, search again and add if still not present */
+ down_write(&fscache_addremove_sem);
- /* the parent is unbacked */
- if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) {
- /* cookie not an index and is unbacked */
- spin_unlock(&cookie->lock);
- _leave(" = NULL [cookie ub,ni]");
- return NULL;
+ list_for_each_entry(cache, &fscache_caches, cache_link) {
+ if (cache->name && name && strcmp(cache->name, name) == 0 &&
+ fscache_get_cache_maybe(cache, fscache_cache_get_acquire))
+ goto got_cache_w;
+ if (!cache->name) {
+ unnamed = cache;
+ if (!name &&
+ fscache_get_cache_maybe(cache, fscache_cache_get_acquire))
+ goto got_cache_w;
+ }
}
- spin_unlock(&cookie->lock);
+ if (unnamed && is_cache &&
+ fscache_get_cache_maybe(unnamed, fscache_cache_get_acquire))
+ goto use_unnamed_cache;
- if (!cookie->def->select_cache)
- goto no_preference;
+ if (!name) {
+ list_for_each_entry(cache, &fscache_caches, cache_link) {
+ if (cache->name &&
+ fscache_get_cache_maybe(cache, fscache_cache_get_acquire))
+ goto got_cache_w;
+ }
+ }
- /* ask the netfs for its preference */
- tag = cookie->def->select_cache(cookie->parent->netfs_data,
- cookie->netfs_data);
- if (!tag)
- goto no_preference;
+ list_add_tail(&candidate->cache_link, &fscache_caches);
+ trace_fscache_cache(candidate->debug_id,
+ refcount_read(&candidate->ref),
+ fscache_cache_new_acquire);
+ up_write(&fscache_addremove_sem);
+ return candidate;
- if (tag == ERR_PTR(-ENOMEM)) {
- _leave(" = NULL [nomem tag]");
- return NULL;
- }
+got_cache_r:
+ up_read(&fscache_addremove_sem);
+ return cache;
+use_unnamed_cache:
+ cache = unnamed;
+ cache->name = candidate->name;
+ candidate->name = NULL;
+got_cache_w:
+ up_write(&fscache_addremove_sem);
+ kfree(candidate->name);
+ kfree(candidate);
+ return cache;
+}
- if (!tag->cache) {
- _leave(" = NULL [unbacked tag]");
- return NULL;
- }
+/**
+ * fscache_acquire_cache - Acquire a cache-level cookie.
+ * @name: The name of the cache.
+ *
+ * Get a cookie to represent an actual cache. If a name is given and there is
+ * a nameless cache record available, this will acquire that and set its name,
+ * directing all the volumes using it to this cache.
+ *
+ * The cache will be switched over to the preparing state if not currently in
+ * use, otherwise -EBUSY will be returned.
+ */
+struct fscache_cache *fscache_acquire_cache(const char *name)
+{
+ struct fscache_cache *cache;
- if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
- return NULL;
+ ASSERT(name);
+ cache = fscache_lookup_cache(name, true);
+ if (IS_ERR(cache))
+ return cache;
- _leave(" = %s [specific]", tag->name);
- return tag->cache;
+ if (!fscache_set_cache_state_maybe(cache,
+ FSCACHE_CACHE_IS_NOT_PRESENT,
+ FSCACHE_CACHE_IS_PREPARING)) {
+ pr_warn("Cache tag %s in use\n", name);
+ fscache_put_cache(cache, fscache_cache_put_cache);
+ return ERR_PTR(-EBUSY);
+ }
-no_preference:
- /* netfs has no preference - just select first cache */
- cache = list_entry(fscache_cache_list.next,
- struct fscache_cache, link);
- _leave(" = %s [first]", cache->tag->name);
return cache;
}
+EXPORT_SYMBOL(fscache_acquire_cache);
/**
- * fscache_init_cache - Initialise a cache record
- * @cache: The cache record to be initialised
- * @ops: The cache operations to be installed in that record
- * @idfmt: Format string to define identifier
- * @...: sprintf-style arguments
+ * fscache_put_cache - Release a cache-level cookie.
+ * @cache: The cache cookie to be released
+ * @where: An indication of where the release happened
*
- * Initialise a record of a cache and fill in the name.
- *
- * See Documentation/filesystems/caching/backend-api.rst for a complete
- * description.
+ * Release the caller's reference on a cache-level cookie. The @where
+ * indication should give information about the circumstances in which the call
+ * occurs and will be logged through a tracepoint.
*/
-void fscache_init_cache(struct fscache_cache *cache,
- const struct fscache_cache_ops *ops,
- const char *idfmt,
- ...)
+void fscache_put_cache(struct fscache_cache *cache,
+ enum fscache_cache_trace where)
{
- va_list va;
+ unsigned int debug_id = cache->debug_id;
+ bool zero;
+ int ref;
- memset(cache, 0, sizeof(*cache));
+ if (IS_ERR_OR_NULL(cache))
+ return;
- cache->ops = ops;
+ zero = __refcount_dec_and_test(&cache->ref, &ref);
+ trace_fscache_cache(debug_id, ref - 1, where);
- va_start(va, idfmt);
- vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
- va_end(va);
+ if (zero) {
+ down_write(&fscache_addremove_sem);
+ list_del_init(&cache->cache_link);
+ up_write(&fscache_addremove_sem);
+ kfree(cache->name);
+ kfree(cache);
+ }
+}
- INIT_WORK(&cache->op_gc, fscache_operation_gc);
- INIT_LIST_HEAD(&cache->link);
- INIT_LIST_HEAD(&cache->object_list);
- INIT_LIST_HEAD(&cache->op_gc_list);
- spin_lock_init(&cache->object_list_lock);
- spin_lock_init(&cache->op_gc_list_lock);
+/**
+ * fscache_relinquish_cache - Reset cache state and release cookie
+ * @cache: The cache cookie to be released
+ *
+ * Reset the state of a cache and release the caller's reference on a cache
+ * cookie.
+ */
+void fscache_relinquish_cache(struct fscache_cache *cache)
+{
+ enum fscache_cache_trace where =
+ (cache->state == FSCACHE_CACHE_IS_PREPARING) ?
+ fscache_cache_put_prep_failed :
+ fscache_cache_put_relinquish;
+
+ cache->ops = NULL;
+ cache->cache_priv = NULL;
+ fscache_set_cache_state(cache, FSCACHE_CACHE_IS_NOT_PRESENT);
+ fscache_put_cache(cache, where);
}
-EXPORT_SYMBOL(fscache_init_cache);
+EXPORT_SYMBOL(fscache_relinquish_cache);
/**
* fscache_add_cache - Declare a cache as being open for business
- * @cache: The record describing the cache
- * @ifsdef: The record of the cache object describing the top-level index
- * @tagname: The tag describing this cache
+ * @cache: The cache-level cookie representing the cache
+ * @ops: Table of cache operations to use
+ * @cache_priv: Private data for the cache record
*
* Add a cache to the system, making it available for netfs's to use.
*
@@ -211,93 +231,97 @@ EXPORT_SYMBOL(fscache_init_cache);
* description.
*/
int fscache_add_cache(struct fscache_cache *cache,
- struct fscache_object *ifsdef,
- const char *tagname)
+ const struct fscache_cache_ops *ops,
+ void *cache_priv)
{
- struct fscache_cache_tag *tag;
-
- ASSERTCMP(ifsdef->cookie, ==, &fscache_fsdef_index);
- BUG_ON(!cache->ops);
- BUG_ON(!ifsdef);
+ int n_accesses;
- cache->flags = 0;
- ifsdef->event_mask =
- ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
- ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
+ _enter("{%s,%s}", ops->name, cache->name);
- if (!tagname)
- tagname = cache->identifier;
+ BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING);
- BUG_ON(!tagname[0]);
-
- _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
-
- /* we use the cache tag to uniquely identify caches */
- tag = __fscache_lookup_cache_tag(tagname);
- if (IS_ERR(tag))
- goto nomem;
-
- if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
- goto tag_in_use;
-
- cache->kobj = kobject_create_and_add(tagname, fscache_root);
- if (!cache->kobj)
- goto error;
-
- ifsdef->cache = cache;
- cache->fsdef = ifsdef;
+ /* Get a ref on the cache cookie and keep its n_accesses counter raised
+ * by 1 to prevent wakeups from transitioning it to 0 until we're
+ * withdrawing caching services from it.
+ */
+ n_accesses = atomic_inc_return(&cache->n_accesses);
+ trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref),
+ n_accesses, fscache_access_cache_pin);
down_write(&fscache_addremove_sem);
- tag->cache = cache;
- cache->tag = tag;
-
- /* add the cache to the list */
- list_add(&cache->link, &fscache_cache_list);
-
- /* add the cache's netfs definition index object to the cache's
- * list */
- spin_lock(&cache->object_list_lock);
- list_add_tail(&ifsdef->cache_link, &cache->object_list);
- spin_unlock(&cache->object_list_lock);
-
- /* add the cache's netfs definition index object to the top level index
- * cookie as a known backing object */
- spin_lock(&fscache_fsdef_index.lock);
-
- hlist_add_head(&ifsdef->cookie_link,
- &fscache_fsdef_index.backing_objects);
-
- refcount_inc(&fscache_fsdef_index.ref);
+ cache->ops = ops;
+ cache->cache_priv = cache_priv;
+ fscache_set_cache_state(cache, FSCACHE_CACHE_IS_ACTIVE);
- /* done */
- spin_unlock(&fscache_fsdef_index.lock);
up_write(&fscache_addremove_sem);
-
- pr_notice("Cache \"%s\" added (type %s)\n",
- cache->tag->name, cache->ops->name);
- kobject_uevent(cache->kobj, KOBJ_ADD);
-
- _leave(" = 0 [%s]", cache->identifier);
+ pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name);
+ _leave(" = 0 [%s]", cache->name);
return 0;
+}
+EXPORT_SYMBOL(fscache_add_cache);
-tag_in_use:
- pr_err("Cache tag '%s' already in use\n", tagname);
- __fscache_release_cache_tag(tag);
- _leave(" = -EXIST");
- return -EEXIST;
-
-error:
- __fscache_release_cache_tag(tag);
- _leave(" = -EINVAL");
- return -EINVAL;
+/**
+ * fscache_begin_cache_access - Pin a cache so it can be accessed
+ * @cache: The cache-level cookie
+ * @why: An indication of the circumstances of the access for tracing
+ *
+ * Attempt to pin the cache to prevent it from going away whilst we're
+ * accessing it and returns true if successful. This works as follows:
+ *
+ * (1) If the cache tests as not live (state is not FSCACHE_CACHE_IS_ACTIVE),
+ * then we return false to indicate access was not permitted.
+ *
+ * (2) If the cache tests as live, then we increment the n_accesses count and
+ * then recheck the liveness, ending the access if it ceased to be live.
+ *
+ * (3) When we end the access, we decrement n_accesses and wake up any
+ * waiters if it reaches 0.
+ *
+ * (4) Whilst the cache is caching, n_accesses is kept artificially
+ * incremented to prevent wakeups from happening.
+ *
+ * (5) When the cache is taken offline, the state is changed to prevent new
+ * accesses, n_accesses is decremented and we wait for n_accesses to
+ * become 0.
+ */
+bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why)
+{
+ int n_accesses;
+
+ if (!fscache_cache_is_live(cache))
+ return false;
+
+ n_accesses = atomic_inc_return(&cache->n_accesses);
+ smp_mb__after_atomic(); /* Reread live flag after n_accesses */
+ trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref),
+ n_accesses, why);
+ if (!fscache_cache_is_live(cache)) {
+ fscache_end_cache_access(cache, fscache_access_unlive);
+ return false;
+ }
+ return true;
+}
-nomem:
- _leave(" = -ENOMEM");
- return -ENOMEM;
+/**
+ * fscache_end_cache_access - Unpin a cache at the end of an access.
+ * @cache: The cache-level cookie
+ * @why: An indication of the circumstances of the access for tracing
+ *
+ * Unpin a cache after we've accessed it. The @why indicator is merely
+ * provided for tracing purposes.
+ */
+void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why)
+{
+ int n_accesses;
+
+ smp_mb__before_atomic();
+ n_accesses = atomic_dec_return(&cache->n_accesses);
+ trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref),
+ n_accesses, why);
+ if (n_accesses == 0)
+ wake_up_var(&cache->n_accesses);
}
-EXPORT_SYMBOL(fscache_add_cache);
/**
* fscache_io_error - Note a cache I/O error
@@ -311,106 +335,94 @@ EXPORT_SYMBOL(fscache_add_cache);
*/
void fscache_io_error(struct fscache_cache *cache)
{
- if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags))
+ if (fscache_set_cache_state_maybe(cache,
+ FSCACHE_CACHE_IS_ACTIVE,
+ FSCACHE_CACHE_GOT_IOERROR))
pr_err("Cache '%s' stopped due to I/O error\n",
- cache->ops->name);
+ cache->name);
}
EXPORT_SYMBOL(fscache_io_error);
-/*
- * request withdrawal of all the objects in a cache
- * - all the objects being withdrawn are moved onto the supplied list
+/**
+ * fscache_withdraw_cache - Withdraw a cache from the active service
+ * @cache: The cache cookie
+ *
+ * Begin the process of withdrawing a cache from service. This stops new
+ * cache-level and volume-level accesses from taking place and waits for
+ * currently ongoing cache-level accesses to end.
*/
-static void fscache_withdraw_all_objects(struct fscache_cache *cache,
- struct list_head *dying_objects)
+void fscache_withdraw_cache(struct fscache_cache *cache)
{
- struct fscache_object *object;
+ int n_accesses;
- while (!list_empty(&cache->object_list)) {
- spin_lock(&cache->object_list_lock);
+ pr_notice("Withdrawing cache \"%s\" (%u objs)\n",
+ cache->name, atomic_read(&cache->object_count));
- if (!list_empty(&cache->object_list)) {
- object = list_entry(cache->object_list.next,
- struct fscache_object, cache_link);
- list_move_tail(&object->cache_link, dying_objects);
+ fscache_set_cache_state(cache, FSCACHE_CACHE_IS_WITHDRAWN);
- _debug("withdraw %x", object->cookie->debug_id);
+ /* Allow wakeups on dec-to-0 */
+ n_accesses = atomic_dec_return(&cache->n_accesses);
+ trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref),
+ n_accesses, fscache_access_cache_unpin);
- /* This must be done under object_list_lock to prevent
- * a race with fscache_drop_object().
- */
- fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
- }
-
- spin_unlock(&cache->object_list_lock);
- cond_resched();
- }
+ wait_var_event(&cache->n_accesses,
+ atomic_read(&cache->n_accesses) == 0);
}
+EXPORT_SYMBOL(fscache_withdraw_cache);
-/**
- * fscache_withdraw_cache - Withdraw a cache from the active service
- * @cache: The record describing the cache
- *
- * Withdraw a cache from service, unbinding all its cache objects from the
- * netfs cookies they're currently representing.
- *
- * See Documentation/filesystems/caching/backend-api.rst for a complete
- * description.
+#ifdef CONFIG_PROC_FS
+static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW";
+
+/*
+ * Generate a list of caches in /proc/fs/fscache/caches
*/
-void fscache_withdraw_cache(struct fscache_cache *cache)
+static int fscache_caches_seq_show(struct seq_file *m, void *v)
{
- LIST_HEAD(dying_objects);
+ struct fscache_cache *cache;
- _enter("");
+ if (v == &fscache_caches) {
+ seq_puts(m,
+ "CACHE REF VOLS OBJS ACCES S NAME\n"
+ "======== ===== ===== ===== ===== = ===============\n"
+ );
+ return 0;
+ }
- pr_notice("Withdrawing cache \"%s\"\n",
- cache->tag->name);
+ cache = list_entry(v, struct fscache_cache, cache_link);
+ seq_printf(m,
+ "%08x %5d %5d %5d %5d %c %s\n",
+ cache->debug_id,
+ refcount_read(&cache->ref),
+ atomic_read(&cache->n_volumes),
+ atomic_read(&cache->object_count),
+ atomic_read(&cache->n_accesses),
+ fscache_cache_states[cache->state],
+ cache->name ?: "-");
+ return 0;
+}
- /* make the cache unavailable for cookie acquisition */
- if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
- BUG();
+static void *fscache_caches_seq_start(struct seq_file *m, loff_t *_pos)
+ __acquires(fscache_addremove_sem)
+{
+ down_read(&fscache_addremove_sem);
+ return seq_list_start_head(&fscache_caches, *_pos);
+}
- down_write(&fscache_addremove_sem);
- list_del_init(&cache->link);
- cache->tag->cache = NULL;
- up_write(&fscache_addremove_sem);
+static void *fscache_caches_seq_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+ return seq_list_next(v, &fscache_caches, _pos);
+}
- /* make sure all pages pinned by operations on behalf of the netfs are
- * written to disk */
- fscache_stat(&fscache_n_cop_sync_cache);
- cache->ops->sync_cache(cache);
- fscache_stat_d(&fscache_n_cop_sync_cache);
-
- /* dissociate all the netfs pages backed by this cache from the block
- * mappings in the cache */
- fscache_stat(&fscache_n_cop_dissociate_pages);
- cache->ops->dissociate_pages(cache);
- fscache_stat_d(&fscache_n_cop_dissociate_pages);
-
- /* we now have to destroy all the active objects pertaining to this
- * cache - which we do by passing them off to thread pool to be
- * disposed of */
- _debug("destroy");
-
- fscache_withdraw_all_objects(cache, &dying_objects);
-
- /* wait for all extant objects to finish their outstanding operations
- * and go away */
- _debug("wait for finish");
- wait_event(fscache_cache_cleared_wq,
- atomic_read(&cache->object_count) == 0);
- _debug("wait for clearance");
- wait_event(fscache_cache_cleared_wq,
- list_empty(&cache->object_list));
- _debug("cleared");
- ASSERT(list_empty(&dying_objects));
-
- kobject_put(cache->kobj);
-
- clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
- fscache_release_cache_tag(cache->tag);
- cache->tag = NULL;
-
- _leave("");
+static void fscache_caches_seq_stop(struct seq_file *m, void *v)
+ __releases(fscache_addremove_sem)
+{
+ up_read(&fscache_addremove_sem);
}
-EXPORT_SYMBOL(fscache_withdraw_cache);
+
+const struct seq_operations fscache_caches_seq_ops = {
+ .start = fscache_caches_seq_start,
+ .next = fscache_caches_seq_next,
+ .stop = fscache_caches_seq_stop,
+ .show = fscache_caches_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index cd42be646ed3..9d3cf0111709 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* netfs cookie management
*
- * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* See Documentation/filesystems/caching/netfs-api.rst for more information on
@@ -15,70 +15,258 @@
struct kmem_cache *fscache_cookie_jar;
-static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
+static void fscache_cookie_lru_timed_out(struct timer_list *timer);
+static void fscache_cookie_lru_worker(struct work_struct *work);
+static void fscache_cookie_worker(struct work_struct *work);
+static void fscache_unhash_cookie(struct fscache_cookie *cookie);
+static void fscache_perform_invalidation(struct fscache_cookie *cookie);
#define fscache_cookie_hash_shift 15
static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift];
static LIST_HEAD(fscache_cookies);
static DEFINE_RWLOCK(fscache_cookies_lock);
-
-static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie,
- loff_t object_size);
-static int fscache_alloc_object(struct fscache_cache *cache,
- struct fscache_cookie *cookie);
-static int fscache_attach_object(struct fscache_cookie *cookie,
- struct fscache_object *object);
-
-static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
+static LIST_HEAD(fscache_cookie_lru);
+static DEFINE_SPINLOCK(fscache_cookie_lru_lock);
+DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out);
+static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker);
+static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD";
+static unsigned int fscache_lru_cookie_timeout = 10 * HZ;
+
+void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
{
- struct fscache_object *object;
- struct hlist_node *o;
const u8 *k;
- unsigned loop;
- pr_err("%c-cookie c=%08x [p=%08x fl=%lx nc=%u na=%u]\n",
+ pr_err("%c-cookie c=%08x [fl=%lx na=%u nA=%u s=%c]\n",
prefix,
cookie->debug_id,
- cookie->parent ? cookie->parent->debug_id : 0,
cookie->flags,
- atomic_read(&cookie->n_children),
- atomic_read(&cookie->n_active));
- pr_err("%c-cookie d=%p{%s} n=%p\n",
+ atomic_read(&cookie->n_active),
+ atomic_read(&cookie->n_accesses),
+ fscache_cookie_states[cookie->state]);
+ pr_err("%c-cookie V=%08x [%s]\n",
prefix,
- cookie->def,
- cookie->def ? cookie->def->name : "?",
- cookie->netfs_data);
-
- o = READ_ONCE(cookie->backing_objects.first);
- if (o) {
- object = hlist_entry(o, struct fscache_object, cookie_link);
- pr_err("%c-cookie o=%u\n", prefix, object->debug_id);
- }
+ cookie->volume->debug_id,
+ cookie->volume->key);
- pr_err("%c-key=[%u] '", prefix, cookie->key_len);
k = (cookie->key_len <= sizeof(cookie->inline_key)) ?
cookie->inline_key : cookie->key;
- for (loop = 0; loop < cookie->key_len; loop++)
- pr_cont("%02x", k[loop]);
- pr_cont("'\n");
+ pr_err("%c-key=[%u] '%*phN'\n", prefix, cookie->key_len, cookie->key_len, k);
}
-void fscache_free_cookie(struct fscache_cookie *cookie)
+static void fscache_free_cookie(struct fscache_cookie *cookie)
{
- if (cookie) {
- BUG_ON(!hlist_empty(&cookie->backing_objects));
- write_lock(&fscache_cookies_lock);
- list_del(&cookie->proc_link);
- write_unlock(&fscache_cookies_lock);
- if (cookie->aux_len > sizeof(cookie->inline_aux))
- kfree(cookie->aux);
- if (cookie->key_len > sizeof(cookie->inline_key))
- kfree(cookie->key);
- kmem_cache_free(fscache_cookie_jar, cookie);
+ if (WARN_ON_ONCE(!list_empty(&cookie->commit_link))) {
+ spin_lock(&fscache_cookie_lru_lock);
+ list_del_init(&cookie->commit_link);
+ spin_unlock(&fscache_cookie_lru_lock);
+ fscache_stat_d(&fscache_n_cookies_lru);
+ fscache_stat(&fscache_n_cookies_lru_removed);
+ }
+
+ if (WARN_ON_ONCE(test_bit(FSCACHE_COOKIE_IS_HASHED, &cookie->flags))) {
+ fscache_print_cookie(cookie, 'F');
+ return;
}
+
+ write_lock(&fscache_cookies_lock);
+ list_del(&cookie->proc_link);
+ write_unlock(&fscache_cookies_lock);
+ if (cookie->aux_len > sizeof(cookie->inline_aux))
+ kfree(cookie->aux);
+ if (cookie->key_len > sizeof(cookie->inline_key))
+ kfree(cookie->key);
+ fscache_stat_d(&fscache_n_cookies);
+ kmem_cache_free(fscache_cookie_jar, cookie);
+}
+
+static void __fscache_queue_cookie(struct fscache_cookie *cookie)
+{
+ if (!queue_work(fscache_wq, &cookie->work))
+ fscache_put_cookie(cookie, fscache_cookie_put_over_queued);
+}
+
+static void fscache_queue_cookie(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
+{
+ fscache_get_cookie(cookie, where);
+ __fscache_queue_cookie(cookie);
}
/*
+ * Initialise the access gate on a cookie by setting a flag to prevent the
+ * state machine from being queued when the access counter transitions to 0.
+ * We're only interested in this when we withdraw caching services from the
+ * cookie.
+ */
+static void fscache_init_access_gate(struct fscache_cookie *cookie)
+{
+ int n_accesses;
+
+ n_accesses = atomic_read(&cookie->n_accesses);
+ trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref),
+ n_accesses, fscache_access_cache_pin);
+ set_bit(FSCACHE_COOKIE_NO_ACCESS_WAKE, &cookie->flags);
+}
+
+/**
+ * fscache_end_cookie_access - Unpin a cache at the end of an access.
+ * @cookie: A data file cookie
+ * @why: An indication of the circumstances of the access for tracing
+ *
+ * Unpin a cache cookie after we've accessed it and bring a deferred
+ * relinquishment or withdrawal state into effect.
+ *
+ * The @why indicator is provided for tracing purposes.
+ */
+void fscache_end_cookie_access(struct fscache_cookie *cookie,
+ enum fscache_access_trace why)
+{
+ int n_accesses;
+
+ smp_mb__before_atomic();
+ n_accesses = atomic_dec_return(&cookie->n_accesses);
+ trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref),
+ n_accesses, why);
+ if (n_accesses == 0 &&
+ !test_bit(FSCACHE_COOKIE_NO_ACCESS_WAKE, &cookie->flags))
+ fscache_queue_cookie(cookie, fscache_cookie_get_end_access);
+}
+EXPORT_SYMBOL(fscache_end_cookie_access);
+
+/*
+ * Pin the cache behind a cookie so that we can access it.
+ */
+static void __fscache_begin_cookie_access(struct fscache_cookie *cookie,
+ enum fscache_access_trace why)
+{
+ int n_accesses;
+
+ n_accesses = atomic_inc_return(&cookie->n_accesses);
+ smp_mb__after_atomic(); /* (Future) read state after is-caching.
+ * Reread n_accesses after is-caching
+ */
+ trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref),
+ n_accesses, why);
+}
+
+/**
+ * fscache_begin_cookie_access - Pin a cache so data can be accessed
+ * @cookie: A data file cookie
+ * @why: An indication of the circumstances of the access for tracing
+ *
+ * Attempt to pin the cache to prevent it from going away whilst we're
+ * accessing data and returns true if successful. This works as follows:
+ *
+ * (1) If the cookie is not being cached (ie. FSCACHE_COOKIE_IS_CACHING is not
+ * set), we return false to indicate access was not permitted.
+ *
+ * (2) If the cookie is being cached, we increment its n_accesses count and
+ * then recheck the IS_CACHING flag, ending the access if it got cleared.
+ *
+ * (3) When we end the access, we decrement the cookie's n_accesses and wake
+ * up any waiters if it reaches 0.
+ *
+ * (4) Whilst the cookie is actively being cached, its n_accesses is kept
+ * artificially incremented to prevent wakeups from happening.
+ *
+ * (5) When the cache is taken offline or if the cookie is culled, the flag is
+ * cleared to prevent new accesses, the cookie's n_accesses is decremented
+ * and we wait for it to become 0.
+ *
+ * The @why indicator is merely provided for tracing purposes.
+ */
+bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
+ enum fscache_access_trace why)
+{
+ if (!test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags))
+ return false;
+ __fscache_begin_cookie_access(cookie, why);
+ if (!test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags) ||
+ !fscache_cache_is_live(cookie->volume->cache)) {
+ fscache_end_cookie_access(cookie, fscache_access_unlive);
+ return false;
+ }
+ return true;
+}
+
+static inline void wake_up_cookie_state(struct fscache_cookie *cookie)
+{
+ /* Use a barrier to ensure that waiters see the state variable
+ * change, as spin_unlock doesn't guarantee a barrier.
+ *
+ * See comments over wake_up_bit() and waitqueue_active().
+ */
+ smp_mb();
+ wake_up_var(&cookie->state);
+}
+
+/*
+ * Change the state a cookie is at and wake up anyone waiting for that. Impose
+ * an ordering between the stuff stored in the cookie and the state member.
+ * Paired with fscache_cookie_state().
+ */
+static void __fscache_set_cookie_state(struct fscache_cookie *cookie,
+ enum fscache_cookie_state state)
+{
+ smp_store_release(&cookie->state, state);
+}
+
+static void fscache_set_cookie_state(struct fscache_cookie *cookie,
+ enum fscache_cookie_state state)
+{
+ spin_lock(&cookie->lock);
+ __fscache_set_cookie_state(cookie, state);
+ spin_unlock(&cookie->lock);
+ wake_up_cookie_state(cookie);
+}
+
+/**
+ * fscache_cookie_lookup_negative - Note negative lookup
+ * @cookie: The cookie that was being looked up
+ *
+ * Note that some part of the metadata path in the cache doesn't exist and so
+ * we can release any waiting readers in the certain knowledge that there's
+ * nothing for them to actually read.
+ *
+ * This function uses no locking and must only be called from the state machine.
+ */
+void fscache_cookie_lookup_negative(struct fscache_cookie *cookie)
+{
+ set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_CREATING);
+}
+EXPORT_SYMBOL(fscache_cookie_lookup_negative);
+
+/**
+ * fscache_resume_after_invalidation - Allow I/O to resume after invalidation
+ * @cookie: The cookie that was invalidated
+ *
+ * Tell fscache that invalidation is sufficiently complete that I/O can be
+ * allowed again.
+ */
+void fscache_resume_after_invalidation(struct fscache_cookie *cookie)
+{
+ fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE);
+}
+EXPORT_SYMBOL(fscache_resume_after_invalidation);
+
+/**
+ * fscache_caching_failed - Report that a failure stopped caching on a cookie
+ * @cookie: The cookie that was affected
+ *
+ * Tell fscache that caching on a cookie needs to be stopped due to some sort
+ * of failure.
+ *
+ * This function uses no locking and must only be called from the state machine.
+ */
+void fscache_caching_failed(struct fscache_cookie *cookie)
+{
+ clear_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags);
+ fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_FAILED);
+}
+EXPORT_SYMBOL(fscache_caching_failed);
+
+/*
* Set the index key in a cookie. The cookie struct has space for a 16-byte
* key plus length and hash, but if that's not big enough, it's instead a
* pointer to a buffer containing 3 bytes of hash, 1 byte of length and then
@@ -87,38 +275,35 @@ void fscache_free_cookie(struct fscache_cookie *cookie)
static int fscache_set_key(struct fscache_cookie *cookie,
const void *index_key, size_t index_key_len)
{
- u32 *buf;
- int bufs;
+ void *buf;
+ size_t buf_size;
- bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf));
+ buf_size = round_up(index_key_len, sizeof(__le32));
if (index_key_len > sizeof(cookie->inline_key)) {
- buf = kcalloc(bufs, sizeof(*buf), GFP_KERNEL);
+ buf = kzalloc(buf_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
cookie->key = buf;
} else {
- buf = (u32 *)cookie->inline_key;
+ buf = cookie->inline_key;
}
memcpy(buf, index_key, index_key_len);
- cookie->key_hash = fscache_hash(0, buf, bufs);
+ cookie->key_hash = fscache_hash(cookie->volume->key_hash,
+ buf, buf_size);
return 0;
}
-static long fscache_compare_cookie(const struct fscache_cookie *a,
- const struct fscache_cookie *b)
+static bool fscache_cookie_same(const struct fscache_cookie *a,
+ const struct fscache_cookie *b)
{
const void *ka, *kb;
- if (a->key_hash != b->key_hash)
- return (long)a->key_hash - (long)b->key_hash;
- if (a->parent != b->parent)
- return (long)a->parent - (long)b->parent;
- if (a->key_len != b->key_len)
- return (long)a->key_len - (long)b->key_len;
- if (a->type != b->type)
- return (long)a->type - (long)b->type;
+ if (a->key_hash != b->key_hash ||
+ a->volume != b->volume ||
+ a->key_len != b->key_len)
+ return false;
if (a->key_len <= sizeof(a->inline_key)) {
ka = &a->inline_key;
@@ -127,7 +312,7 @@ static long fscache_compare_cookie(const struct fscache_cookie *a,
ka = a->key;
kb = b->key;
}
- return memcmp(ka, kb, a->key_len);
+ return memcmp(ka, kb, a->key_len) == 0;
}
static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1);
@@ -135,12 +320,11 @@ static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1);
/*
* Allocate a cookie.
*/
-struct fscache_cookie *fscache_alloc_cookie(
- struct fscache_cookie *parent,
- const struct fscache_cookie_def *def,
+static struct fscache_cookie *fscache_alloc_cookie(
+ struct fscache_volume *volume,
+ u8 advice,
const void *index_key, size_t index_key_len,
const void *aux_data, size_t aux_data_len,
- void *netfs_data,
loff_t object_size)
{
struct fscache_cookie *cookie;
@@ -149,9 +333,15 @@ struct fscache_cookie *fscache_alloc_cookie(
cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
if (!cookie)
return NULL;
+ fscache_stat(&fscache_n_cookies);
- cookie->key_len = index_key_len;
- cookie->aux_len = aux_data_len;
+ cookie->volume = volume;
+ cookie->advice = advice;
+ cookie->key_len = index_key_len;
+ cookie->aux_len = aux_data_len;
+ cookie->object_size = object_size;
+ if (object_size == 0)
+ __set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
if (fscache_set_key(cookie, index_key, index_key_len) < 0)
goto nomem;
@@ -165,30 +355,16 @@ struct fscache_cookie *fscache_alloc_cookie(
}
refcount_set(&cookie->ref, 1);
- atomic_set(&cookie->n_children, 0);
cookie->debug_id = atomic_inc_return(&fscache_cookie_debug_id);
-
- /* We keep the active count elevated until relinquishment to prevent an
- * attempt to wake up every time the object operations queue quiesces.
- */
- atomic_set(&cookie->n_active, 1);
-
- cookie->def = def;
- cookie->parent = parent;
- cookie->netfs_data = netfs_data;
- cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET);
- cookie->type = def->type;
spin_lock_init(&cookie->lock);
- spin_lock_init(&cookie->stores_lock);
- INIT_HLIST_HEAD(&cookie->backing_objects);
-
- /* radix tree insertion won't use the preallocation pool unless it's
- * told it may not wait */
- INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_LIST_HEAD(&cookie->commit_link);
+ INIT_WORK(&cookie->work, fscache_cookie_worker);
+ __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT);
write_lock(&fscache_cookies_lock);
list_add_tail(&cookie->proc_link, &fscache_cookies);
write_unlock(&fscache_cookies_lock);
+ fscache_see_cookie(cookie, fscache_cookie_new_acquire);
return cookie;
nomem:
@@ -196,13 +372,28 @@ nomem:
return NULL;
}
+static void fscache_wait_on_collision(struct fscache_cookie *candidate,
+ struct fscache_cookie *wait_for)
+{
+ enum fscache_cookie_state *statep = &wait_for->state;
+
+ wait_var_event_timeout(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED,
+ 20 * HZ);
+ if (READ_ONCE(*statep) != FSCACHE_COOKIE_STATE_DROPPED) {
+ pr_notice("Potential collision c=%08x old: c=%08x",
+ candidate->debug_id, wait_for->debug_id);
+ wait_var_event(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED);
+ }
+}
+
/*
* Attempt to insert the new cookie into the hash. If there's a collision, we
- * return the old cookie if it's not in use and an error otherwise.
+ * wait for the old cookie to be dropped if it's being relinquished and fail
+ * (returning false) otherwise.
*/
-struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate)
+static bool fscache_hash_cookie(struct fscache_cookie *candidate)
{
- struct fscache_cookie *cursor;
+ struct fscache_cookie *cursor, *wait_for = NULL;
struct hlist_bl_head *h;
struct hlist_bl_node *p;
unsigned int bucket;
@@ -212,64 +403,53 @@ struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate)
hlist_bl_lock(h);
hlist_bl_for_each_entry(cursor, p, h, hash_link) {
- if (fscache_compare_cookie(candidate, cursor) == 0)
- goto collision;
+ if (fscache_cookie_same(candidate, cursor)) {
+ if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cursor->flags))
+ goto collision;
+ wait_for = fscache_get_cookie(cursor,
+ fscache_cookie_get_hash_collision);
+ break;
+ }
}
- __set_bit(FSCACHE_COOKIE_ACQUIRED, &candidate->flags);
- fscache_cookie_get(candidate->parent, fscache_cookie_get_acquire_parent);
- atomic_inc(&candidate->parent->n_children);
+ fscache_get_volume(candidate->volume, fscache_volume_get_cookie);
+ atomic_inc(&candidate->volume->n_cookies);
hlist_bl_add_head(&candidate->hash_link, h);
+ set_bit(FSCACHE_COOKIE_IS_HASHED, &candidate->flags);
hlist_bl_unlock(h);
- return candidate;
-collision:
- if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) {
- trace_fscache_cookie(cursor->debug_id, refcount_read(&cursor->ref),
- fscache_cookie_collision);
- pr_err("Duplicate cookie detected\n");
- fscache_print_cookie(cursor, 'O');
- fscache_print_cookie(candidate, 'N');
- hlist_bl_unlock(h);
- return NULL;
+ if (wait_for) {
+ fscache_wait_on_collision(candidate, wait_for);
+ fscache_put_cookie(wait_for, fscache_cookie_put_hash_collision);
}
+ return true;
- fscache_cookie_get(cursor, fscache_cookie_get_reacquire);
+collision:
+ trace_fscache_cookie(cursor->debug_id, refcount_read(&cursor->ref),
+ fscache_cookie_collision);
+ pr_err("Duplicate cookie detected\n");
+ fscache_print_cookie(cursor, 'O');
+ fscache_print_cookie(candidate, 'N');
hlist_bl_unlock(h);
- return cursor;
+ return false;
}
/*
- * request a cookie to represent an object (index, datafile, xattr, etc)
- * - parent specifies the parent object
- * - the top level index cookie for each netfs is stored in the fscache_netfs
- * struct upon registration
- * - def points to the definition
- * - the netfs_data will be passed to the functions pointed to in *def
- * - all attached caches will be searched to see if they contain this object
- * - index objects aren't stored on disk until there's a dependent file that
- * needs storing
- * - other objects are stored in a selected cache immediately, and all the
- * indices forming the path to it are instantiated if necessary
- * - we never let on to the netfs about errors
- * - we may set a negative cookie pointer, but that's okay
+ * Request a cookie to represent a data storage object within a volume.
+ *
+ * We never let on to the netfs about errors. We may set a negative cookie
+ * pointer (NULL) to the caller, but that's okay.
*/
struct fscache_cookie *__fscache_acquire_cookie(
- struct fscache_cookie *parent,
- const struct fscache_cookie_def *def,
+ struct fscache_volume *volume,
+ u8 advice,
const void *index_key, size_t index_key_len,
const void *aux_data, size_t aux_data_len,
- void *netfs_data,
- loff_t object_size,
- bool enable)
+ loff_t object_size)
{
- struct fscache_cookie *candidate, *cookie;
-
- BUG_ON(!def);
+ struct fscache_cookie *cookie;
- _enter("{%s},{%s},%p,%u",
- parent ? (char *) parent->def->name : "<no-parent>",
- def->name, netfs_data, enable);
+ _enter("V=%x", volume->debug_id);
if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255)
return NULL;
@@ -280,563 +460,440 @@ struct fscache_cookie *__fscache_acquire_cookie(
fscache_stat(&fscache_n_acquires);
- /* if there's no parent cookie, then we don't create one here either */
- if (!parent) {
- fscache_stat(&fscache_n_acquires_null);
- _leave(" [no parent]");
- return NULL;
- }
-
- /* validate the definition */
- BUG_ON(!def->name[0]);
-
- BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
- parent->type != FSCACHE_COOKIE_TYPE_INDEX);
-
- candidate = fscache_alloc_cookie(parent, def,
- index_key, index_key_len,
- aux_data, aux_data_len,
- netfs_data, object_size);
- if (!candidate) {
+ cookie = fscache_alloc_cookie(volume, advice,
+ index_key, index_key_len,
+ aux_data, aux_data_len,
+ object_size);
+ if (!cookie) {
fscache_stat(&fscache_n_acquires_oom);
- _leave(" [ENOMEM]");
return NULL;
}
- cookie = fscache_hash_cookie(candidate);
- if (!cookie) {
- trace_fscache_cookie(candidate->debug_id, 1,
- fscache_cookie_discard);
- goto out;
- }
-
- if (cookie == candidate)
- candidate = NULL;
-
- switch (cookie->type) {
- case FSCACHE_COOKIE_TYPE_INDEX:
- fscache_stat(&fscache_n_cookie_index);
- break;
- case FSCACHE_COOKIE_TYPE_DATAFILE:
- fscache_stat(&fscache_n_cookie_data);
- break;
- default:
- fscache_stat(&fscache_n_cookie_special);
- break;
+ if (!fscache_hash_cookie(cookie)) {
+ fscache_see_cookie(cookie, fscache_cookie_discard);
+ fscache_free_cookie(cookie);
+ return NULL;
}
trace_fscache_acquire(cookie);
-
- if (enable) {
- /* if the object is an index then we need do nothing more here
- * - we create indices on disk when we need them as an index
- * may exist in multiple caches */
- if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) {
- if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) {
- set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
- } else {
- atomic_dec(&parent->n_children);
- fscache_cookie_put(cookie,
- fscache_cookie_put_acquire_nobufs);
- fscache_stat(&fscache_n_acquires_nobufs);
- _leave(" = NULL");
- return NULL;
- }
- } else {
- set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
- }
- }
-
fscache_stat(&fscache_n_acquires_ok);
-
-out:
- fscache_free_cookie(candidate);
+ _leave(" = c=%08x", cookie->debug_id);
return cookie;
}
EXPORT_SYMBOL(__fscache_acquire_cookie);
/*
- * Enable a cookie to permit it to accept new operations.
+ * Prepare a cache object to be written to.
*/
-void __fscache_enable_cookie(struct fscache_cookie *cookie,
- const void *aux_data,
- loff_t object_size,
- bool (*can_enable)(void *data),
- void *data)
+static void fscache_prepare_to_write(struct fscache_cookie *cookie)
{
- _enter("%x", cookie->debug_id);
-
- trace_fscache_enable(cookie);
-
- wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
- TASK_UNINTERRUPTIBLE);
-
- fscache_update_aux(cookie, aux_data);
-
- if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
- goto out_unlock;
-
- if (can_enable && !can_enable(data)) {
- /* The netfs decided it didn't want to enable after all */
- } else if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) {
- /* Wait for outstanding disablement to complete */
- __fscache_wait_on_invalidate(cookie);
-
- if (fscache_acquire_non_index_cookie(cookie, object_size) == 0)
- set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
- } else {
- set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
- }
-
-out_unlock:
- clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+ cookie->volume->cache->ops->prepare_to_write(cookie);
}
-EXPORT_SYMBOL(__fscache_enable_cookie);
/*
- * acquire a non-index cookie
- * - this must make sure the index chain is instantiated and instantiate the
- * object representation too
+ * Look up a cookie in the cache.
*/
-static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie,
- loff_t object_size)
+static void fscache_perform_lookup(struct fscache_cookie *cookie)
{
- struct fscache_object *object;
- struct fscache_cache *cache;
- int ret;
+ enum fscache_access_trace trace = fscache_access_lookup_cookie_end_failed;
+ bool need_withdraw = false;
_enter("");
- set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
-
- /* now we need to see whether the backing objects for this cookie yet
- * exist, if not there'll be nothing to search */
- down_read(&fscache_addremove_sem);
-
- if (list_empty(&fscache_cache_list)) {
- up_read(&fscache_addremove_sem);
- _leave(" = 0 [no caches]");
- return 0;
- }
-
- /* select a cache in which to store the object */
- cache = fscache_select_cache_for_object(cookie->parent);
- if (!cache) {
- up_read(&fscache_addremove_sem);
- fscache_stat(&fscache_n_acquires_no_cache);
- _leave(" = -ENOMEDIUM [no cache]");
- return -ENOMEDIUM;
- }
-
- _debug("cache %s", cache->tag->name);
-
- set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
-
- /* ask the cache to allocate objects for this cookie and its parent
- * chain */
- ret = fscache_alloc_object(cache, cookie);
- if (ret < 0) {
- up_read(&fscache_addremove_sem);
- _leave(" = %d", ret);
- return ret;
- }
-
- spin_lock(&cookie->lock);
- if (hlist_empty(&cookie->backing_objects)) {
- spin_unlock(&cookie->lock);
- goto unavailable;
+ if (!cookie->volume->cache_priv) {
+ fscache_create_volume(cookie->volume, true);
+ if (!cookie->volume->cache_priv) {
+ fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT);
+ goto out;
+ }
}
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
-
- fscache_set_store_limit(object, object_size);
-
- /* initiate the process of looking up all the objects in the chain
- * (done by fscache_initialise_object()) */
- fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
-
- spin_unlock(&cookie->lock);
-
- /* we may be required to wait for lookup to complete at this point */
- if (!fscache_defer_lookup) {
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
- TASK_UNINTERRUPTIBLE);
- if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
- goto unavailable;
+ if (!cookie->volume->cache->ops->lookup_cookie(cookie)) {
+ if (cookie->state != FSCACHE_COOKIE_STATE_FAILED)
+ fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT);
+ need_withdraw = true;
+ _leave(" [fail]");
+ goto out;
}
- up_read(&fscache_addremove_sem);
- _leave(" = 0 [deferred]");
- return 0;
+ fscache_see_cookie(cookie, fscache_cookie_see_active);
+ fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE);
+ trace = fscache_access_lookup_cookie_end;
-unavailable:
- up_read(&fscache_addremove_sem);
- _leave(" = -ENOBUFS");
- return -ENOBUFS;
+out:
+ fscache_end_cookie_access(cookie, trace);
+ if (need_withdraw)
+ fscache_withdraw_cookie(cookie);
+ fscache_end_volume_access(cookie->volume, cookie, trace);
}
/*
- * recursively allocate cache object records for a cookie/cache combination
- * - caller must be holding the addremove sem
+ * Begin the process of looking up a cookie. We offload the actual process to
+ * a worker thread.
*/
-static int fscache_alloc_object(struct fscache_cache *cache,
- struct fscache_cookie *cookie)
+static bool fscache_begin_lookup(struct fscache_cookie *cookie, bool will_modify)
{
- struct fscache_object *object;
- int ret;
-
- _enter("%s,%x{%s}", cache->tag->name, cookie->debug_id, cookie->def->name);
-
- spin_lock(&cookie->lock);
- hlist_for_each_entry(object, &cookie->backing_objects,
- cookie_link) {
- if (object->cache == cache)
- goto object_already_extant;
+ if (will_modify) {
+ set_bit(FSCACHE_COOKIE_LOCAL_WRITE, &cookie->flags);
+ set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags);
}
- spin_unlock(&cookie->lock);
-
- /* ask the cache to allocate an object (we may end up with duplicate
- * objects at this stage, but we sort that out later) */
- fscache_stat(&fscache_n_cop_alloc_object);
- object = cache->ops->alloc_object(cache, cookie);
- fscache_stat_d(&fscache_n_cop_alloc_object);
- if (IS_ERR(object)) {
- fscache_stat(&fscache_n_object_no_alloc);
- ret = PTR_ERR(object);
- goto error;
- }
-
- ASSERTCMP(object->cookie, ==, cookie);
- fscache_stat(&fscache_n_object_alloc);
-
- object->debug_id = atomic_inc_return(&fscache_object_debug_id);
-
- _debug("ALLOC OBJ%x: %s {%lx}",
- object->debug_id, cookie->def->name, object->events);
-
- ret = fscache_alloc_object(cache, cookie->parent);
- if (ret < 0)
- goto error_put;
-
- /* only attach if we managed to allocate all we needed, otherwise
- * discard the object we just allocated and instead use the one
- * attached to the cookie */
- if (fscache_attach_object(cookie, object) < 0) {
- fscache_stat(&fscache_n_cop_put_object);
- cache->ops->put_object(object, fscache_obj_put_attach_fail);
- fscache_stat_d(&fscache_n_cop_put_object);
- }
-
- _leave(" = 0");
- return 0;
-
-object_already_extant:
- ret = -ENOBUFS;
- if (fscache_object_is_dying(object) ||
- fscache_cache_is_broken(object)) {
- spin_unlock(&cookie->lock);
- goto error;
- }
- spin_unlock(&cookie->lock);
- _leave(" = 0 [found]");
- return 0;
-
-error_put:
- fscache_stat(&fscache_n_cop_put_object);
- cache->ops->put_object(object, fscache_obj_put_alloc_fail);
- fscache_stat_d(&fscache_n_cop_put_object);
-error:
- _leave(" = %d", ret);
- return ret;
+ if (!fscache_begin_volume_access(cookie->volume, cookie,
+ fscache_access_lookup_cookie))
+ return false;
+
+ __fscache_begin_cookie_access(cookie, fscache_access_lookup_cookie);
+ __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_LOOKING_UP);
+ set_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags);
+ set_bit(FSCACHE_COOKIE_HAS_BEEN_CACHED, &cookie->flags);
+ return true;
}
/*
- * attach a cache object to a cookie
+ * Start using the cookie for I/O. This prevents the backing object from being
+ * reaped by VM pressure.
*/
-static int fscache_attach_object(struct fscache_cookie *cookie,
- struct fscache_object *object)
+void __fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify)
{
- struct fscache_object *p;
- struct fscache_cache *cache = object->cache;
- int ret;
+ enum fscache_cookie_state state;
+ bool queue = false;
+ int n_active;
- _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
+ _enter("c=%08x", cookie->debug_id);
- ASSERTCMP(object->cookie, ==, cookie);
+ if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags),
+ "Trying to use relinquished cookie\n"))
+ return;
spin_lock(&cookie->lock);
- /* there may be multiple initial creations of this object, but we only
- * want one */
- ret = -EEXIST;
- hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
- if (p->cache == object->cache) {
- if (fscache_object_is_dying(p))
- ret = -ENOBUFS;
- goto cant_attach_object;
- }
- }
+ n_active = atomic_inc_return(&cookie->n_active);
+ trace_fscache_active(cookie->debug_id, refcount_read(&cookie->ref),
+ n_active, atomic_read(&cookie->n_accesses),
+ will_modify ?
+ fscache_active_use_modify : fscache_active_use);
+
+again:
+ state = fscache_cookie_state(cookie);
+ switch (state) {
+ case FSCACHE_COOKIE_STATE_QUIESCENT:
+ queue = fscache_begin_lookup(cookie, will_modify);
+ break;
- /* pin the parent object */
- spin_lock_nested(&cookie->parent->lock, 1);
- hlist_for_each_entry(p, &cookie->parent->backing_objects,
- cookie_link) {
- if (p->cache == object->cache) {
- if (fscache_object_is_dying(p)) {
- ret = -ENOBUFS;
- spin_unlock(&cookie->parent->lock);
- goto cant_attach_object;
- }
- object->parent = p;
- spin_lock(&p->lock);
- p->n_children++;
- spin_unlock(&p->lock);
- break;
+ case FSCACHE_COOKIE_STATE_LOOKING_UP:
+ case FSCACHE_COOKIE_STATE_CREATING:
+ if (will_modify)
+ set_bit(FSCACHE_COOKIE_LOCAL_WRITE, &cookie->flags);
+ break;
+ case FSCACHE_COOKIE_STATE_ACTIVE:
+ case FSCACHE_COOKIE_STATE_INVALIDATING:
+ if (will_modify &&
+ !test_and_set_bit(FSCACHE_COOKIE_LOCAL_WRITE, &cookie->flags)) {
+ set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags);
+ queue = true;
}
- }
- spin_unlock(&cookie->parent->lock);
-
- /* attach to the cache's object list */
- if (list_empty(&object->cache_link)) {
- spin_lock(&cache->object_list_lock);
- list_add(&object->cache_link, &cache->object_list);
- spin_unlock(&cache->object_list_lock);
- }
-
- /* Attach to the cookie. The object already has a ref on it. */
- hlist_add_head(&object->cookie_link, &cookie->backing_objects);
- ret = 0;
-
-cant_attach_object:
- spin_unlock(&cookie->lock);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * Invalidate an object. Callable with spinlocks held.
- */
-void __fscache_invalidate(struct fscache_cookie *cookie)
-{
- struct fscache_object *object;
-
- _enter("{%s}", cookie->def->name);
-
- fscache_stat(&fscache_n_invalidates);
+ break;
- /* Only permit invalidation of data files. Invalidating an index will
- * require the caller to release all its attachments to the tree rooted
- * there, and if it's doing that, it may as well just retire the
- * cookie.
- */
- ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+ case FSCACHE_COOKIE_STATE_FAILED:
+ case FSCACHE_COOKIE_STATE_WITHDRAWING:
+ break;
- /* If there's an object, we tell the object state machine to handle the
- * invalidation on our behalf, otherwise there's nothing to do.
- */
- if (!hlist_empty(&cookie->backing_objects)) {
+ case FSCACHE_COOKIE_STATE_LRU_DISCARDING:
+ spin_unlock(&cookie->lock);
+ wait_var_event(&cookie->state,
+ fscache_cookie_state(cookie) !=
+ FSCACHE_COOKIE_STATE_LRU_DISCARDING);
spin_lock(&cookie->lock);
+ goto again;
- if (fscache_cookie_enabled(cookie) &&
- !hlist_empty(&cookie->backing_objects) &&
- !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
- &cookie->flags)) {
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object,
- cookie_link);
- if (fscache_object_is_live(object))
- fscache_raise_event(
- object, FSCACHE_OBJECT_EV_INVALIDATE);
- }
-
- spin_unlock(&cookie->lock);
+ case FSCACHE_COOKIE_STATE_DROPPED:
+ case FSCACHE_COOKIE_STATE_RELINQUISHING:
+ WARN(1, "Can't use cookie in state %u\n", state);
+ break;
}
+ spin_unlock(&cookie->lock);
+ if (queue)
+ fscache_queue_cookie(cookie, fscache_cookie_get_use_work);
_leave("");
}
-EXPORT_SYMBOL(__fscache_invalidate);
+EXPORT_SYMBOL(__fscache_use_cookie);
-/*
- * Wait for object invalidation to complete.
- */
-void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
+static void fscache_unuse_cookie_locked(struct fscache_cookie *cookie)
{
- _enter("%x", cookie->debug_id);
+ clear_bit(FSCACHE_COOKIE_DISABLED, &cookie->flags);
+ if (!test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags))
+ return;
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
- TASK_UNINTERRUPTIBLE);
+ cookie->unused_at = jiffies;
+ spin_lock(&fscache_cookie_lru_lock);
+ if (list_empty(&cookie->commit_link)) {
+ fscache_get_cookie(cookie, fscache_cookie_get_lru);
+ fscache_stat(&fscache_n_cookies_lru);
+ }
+ list_move_tail(&cookie->commit_link, &fscache_cookie_lru);
- _leave("");
+ spin_unlock(&fscache_cookie_lru_lock);
+ timer_reduce(&fscache_cookie_lru_timer,
+ jiffies + fscache_lru_cookie_timeout);
}
-EXPORT_SYMBOL(__fscache_wait_on_invalidate);
/*
- * update the index entries backing a cookie
+ * Stop using the cookie for I/O.
*/
-void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data)
+void __fscache_unuse_cookie(struct fscache_cookie *cookie,
+ const void *aux_data, const loff_t *object_size)
{
- struct fscache_object *object;
-
- fscache_stat(&fscache_n_updates);
-
- if (!cookie) {
- fscache_stat(&fscache_n_updates_null);
- _leave(" [no cookie]");
+ unsigned int debug_id = cookie->debug_id;
+ unsigned int r = refcount_read(&cookie->ref);
+ unsigned int a = atomic_read(&cookie->n_accesses);
+ unsigned int c;
+
+ if (aux_data || object_size)
+ __fscache_update_cookie(cookie, aux_data, object_size);
+
+ /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
+ c = atomic_fetch_add_unless(&cookie->n_active, -1, 1);
+ if (c != 1) {
+ trace_fscache_active(debug_id, r, c - 1, a, fscache_active_unuse);
return;
}
- _enter("{%s}", cookie->def->name);
-
spin_lock(&cookie->lock);
-
- fscache_update_aux(cookie, aux_data);
-
- if (fscache_cookie_enabled(cookie)) {
- /* update the index entry on disk in each cache backing this
- * cookie.
- */
- hlist_for_each_entry(object,
- &cookie->backing_objects, cookie_link) {
- fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
- }
- }
-
+ r = refcount_read(&cookie->ref);
+ a = atomic_read(&cookie->n_accesses);
+ c = atomic_dec_return(&cookie->n_active);
+ trace_fscache_active(debug_id, r, c, a, fscache_active_unuse);
+ if (c == 0)
+ fscache_unuse_cookie_locked(cookie);
spin_unlock(&cookie->lock);
- _leave("");
}
-EXPORT_SYMBOL(__fscache_update_cookie);
+EXPORT_SYMBOL(__fscache_unuse_cookie);
/*
- * Disable a cookie to stop it from accepting new requests from the netfs.
+ * Perform work upon the cookie, such as committing its cache state,
+ * relinquishing it or withdrawing the backing cache. We're protected from the
+ * cache going away under us as object withdrawal must come through this
+ * non-reentrant work item.
*/
-void __fscache_disable_cookie(struct fscache_cookie *cookie,
- const void *aux_data,
- bool invalidate)
+static void fscache_cookie_state_machine(struct fscache_cookie *cookie)
{
- struct fscache_object *object;
- bool awaken = false;
+ enum fscache_cookie_state state;
+ bool wake = false;
- _enter("%x,%u", cookie->debug_id, invalidate);
+ _enter("c=%x", cookie->debug_id);
- trace_fscache_disable(cookie);
-
- ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
-
- if (atomic_read(&cookie->n_children) != 0) {
- pr_err("Cookie '%s' still has children\n",
- cookie->def->name);
- BUG();
- }
+again:
+ spin_lock(&cookie->lock);
+again_locked:
+ state = cookie->state;
+ switch (state) {
+ case FSCACHE_COOKIE_STATE_QUIESCENT:
+ /* The jump from the QUIESCENT state to the LOOKING_UP state is
+ * made by fscache_use_cookie().
+ */
- wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
- TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&cookie->n_accesses) == 0 &&
+ test_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags)) {
+ __fscache_set_cookie_state(cookie,
+ FSCACHE_COOKIE_STATE_RELINQUISHING);
+ wake = true;
+ goto again_locked;
+ }
+ break;
- fscache_update_aux(cookie, aux_data);
+ case FSCACHE_COOKIE_STATE_LOOKING_UP:
+ spin_unlock(&cookie->lock);
+ fscache_init_access_gate(cookie);
+ fscache_perform_lookup(cookie);
+ goto again;
- if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
- goto out_unlock_enable;
+ case FSCACHE_COOKIE_STATE_INVALIDATING:
+ spin_unlock(&cookie->lock);
+ fscache_perform_invalidation(cookie);
+ goto again;
+
+ case FSCACHE_COOKIE_STATE_ACTIVE:
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags)) {
+ spin_unlock(&cookie->lock);
+ fscache_prepare_to_write(cookie);
+ spin_lock(&cookie->lock);
+ }
+ if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) {
+ __fscache_set_cookie_state(cookie,
+ FSCACHE_COOKIE_STATE_LRU_DISCARDING);
+ wake = true;
+ goto again_locked;
+ }
+ fallthrough;
- /* If the cookie is being invalidated, wait for that to complete first
- * so that we can reuse the flag.
- */
- __fscache_wait_on_invalidate(cookie);
+ case FSCACHE_COOKIE_STATE_FAILED:
+ if (atomic_read(&cookie->n_accesses) != 0)
+ break;
+ if (test_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags)) {
+ __fscache_set_cookie_state(cookie,
+ FSCACHE_COOKIE_STATE_RELINQUISHING);
+ wake = true;
+ goto again_locked;
+ }
+ if (test_bit(FSCACHE_COOKIE_DO_WITHDRAW, &cookie->flags)) {
+ __fscache_set_cookie_state(cookie,
+ FSCACHE_COOKIE_STATE_WITHDRAWING);
+ wake = true;
+ goto again_locked;
+ }
+ break;
- /* Dispose of the backing objects */
- set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags);
+ case FSCACHE_COOKIE_STATE_LRU_DISCARDING:
+ case FSCACHE_COOKIE_STATE_RELINQUISHING:
+ case FSCACHE_COOKIE_STATE_WITHDRAWING:
+ if (cookie->cache_priv) {
+ spin_unlock(&cookie->lock);
+ cookie->volume->cache->ops->withdraw_cookie(cookie);
+ spin_lock(&cookie->lock);
+ }
- spin_lock(&cookie->lock);
- if (!hlist_empty(&cookie->backing_objects)) {
- hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
- if (invalidate)
- set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
- clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+ switch (state) {
+ case FSCACHE_COOKIE_STATE_RELINQUISHING:
+ fscache_see_cookie(cookie, fscache_cookie_see_relinquish);
+ fscache_unhash_cookie(cookie);
+ __fscache_set_cookie_state(cookie,
+ FSCACHE_COOKIE_STATE_DROPPED);
+ wake = true;
+ goto out;
+ case FSCACHE_COOKIE_STATE_LRU_DISCARDING:
+ fscache_see_cookie(cookie, fscache_cookie_see_lru_discard);
+ break;
+ case FSCACHE_COOKIE_STATE_WITHDRAWING:
+ fscache_see_cookie(cookie, fscache_cookie_see_withdraw);
+ break;
+ default:
+ BUG();
}
- } else {
- if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
- awaken = true;
- }
- spin_unlock(&cookie->lock);
- if (awaken)
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
- /* Wait for cessation of activity requiring access to the netfs (when
- * n_active reaches 0). This makes sure outstanding reads and writes
- * have completed.
- */
- if (!atomic_dec_and_test(&cookie->n_active)) {
- wait_var_event(&cookie->n_active,
- !atomic_read(&cookie->n_active));
- }
+ clear_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags);
+ clear_bit(FSCACHE_COOKIE_DO_WITHDRAW, &cookie->flags);
+ clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags);
+ clear_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags);
+ set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT);
+ wake = true;
+ goto again_locked;
- /* Make sure any pending writes are cancelled. */
- if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX)
- fscache_invalidate_writes(cookie);
+ case FSCACHE_COOKIE_STATE_DROPPED:
+ break;
- /* Reset the cookie state if it wasn't relinquished */
- if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) {
- atomic_inc(&cookie->n_active);
- set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+ default:
+ WARN_ONCE(1, "Cookie %x in unexpected state %u\n",
+ cookie->debug_id, state);
+ break;
}
-out_unlock_enable:
- clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+out:
+ spin_unlock(&cookie->lock);
+ if (wake)
+ wake_up_cookie_state(cookie);
_leave("");
}
-EXPORT_SYMBOL(__fscache_disable_cookie);
+
+static void fscache_cookie_worker(struct work_struct *work)
+{
+ struct fscache_cookie *cookie = container_of(work, struct fscache_cookie, work);
+
+ fscache_see_cookie(cookie, fscache_cookie_see_work);
+ fscache_cookie_state_machine(cookie);
+ fscache_put_cookie(cookie, fscache_cookie_put_work);
+}
/*
- * release a cookie back to the cache
- * - the object will be marked as recyclable on disk if retire is true
- * - all dependents of this cookie must have already been unregistered
- * (indices/files/pages)
+ * Wait for the object to become inactive. The cookie's work item will be
+ * scheduled when someone transitions n_accesses to 0 - but if someone's
+ * already done that, schedule it anyway.
*/
-void __fscache_relinquish_cookie(struct fscache_cookie *cookie,
- const void *aux_data,
- bool retire)
+static void __fscache_withdraw_cookie(struct fscache_cookie *cookie)
{
- fscache_stat(&fscache_n_relinquishes);
- if (retire)
- fscache_stat(&fscache_n_relinquishes_retire);
+ int n_accesses;
+ bool unpinned;
+
+ unpinned = test_and_clear_bit(FSCACHE_COOKIE_NO_ACCESS_WAKE, &cookie->flags);
+
+ /* Need to read the access count after unpinning */
+ n_accesses = atomic_read(&cookie->n_accesses);
+ if (unpinned)
+ trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref),
+ n_accesses, fscache_access_cache_unpin);
+ if (n_accesses == 0)
+ fscache_queue_cookie(cookie, fscache_cookie_get_end_access);
+}
- if (!cookie) {
- fscache_stat(&fscache_n_relinquishes_null);
- _leave(" [no cookie]");
- return;
- }
+static void fscache_cookie_lru_do_one(struct fscache_cookie *cookie)
+{
+ fscache_see_cookie(cookie, fscache_cookie_see_lru_do_one);
- _enter("%x{%s,%d},%d",
- cookie->debug_id, cookie->def->name,
- atomic_read(&cookie->n_active), retire);
+ spin_lock(&cookie->lock);
+ if (cookie->state != FSCACHE_COOKIE_STATE_ACTIVE ||
+ time_before(jiffies, cookie->unused_at + fscache_lru_cookie_timeout) ||
+ atomic_read(&cookie->n_active) > 0) {
+ spin_unlock(&cookie->lock);
+ fscache_stat(&fscache_n_cookies_lru_removed);
+ } else {
+ set_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags);
+ spin_unlock(&cookie->lock);
+ fscache_stat(&fscache_n_cookies_lru_expired);
+ _debug("lru c=%x", cookie->debug_id);
+ __fscache_withdraw_cookie(cookie);
+ }
- trace_fscache_relinquish(cookie, retire);
+ fscache_put_cookie(cookie, fscache_cookie_put_lru);
+}
- /* No further netfs-accessing operations on this cookie permitted */
- if (test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags))
- BUG();
+static void fscache_cookie_lru_worker(struct work_struct *work)
+{
+ struct fscache_cookie *cookie;
+ unsigned long unused_at;
- __fscache_disable_cookie(cookie, aux_data, retire);
+ spin_lock(&fscache_cookie_lru_lock);
- /* Clear pointers back to the netfs */
- cookie->netfs_data = NULL;
- cookie->def = NULL;
- BUG_ON(!radix_tree_empty(&cookie->stores));
+ while (!list_empty(&fscache_cookie_lru)) {
+ cookie = list_first_entry(&fscache_cookie_lru,
+ struct fscache_cookie, commit_link);
+ unused_at = cookie->unused_at + fscache_lru_cookie_timeout;
+ if (time_before(jiffies, unused_at)) {
+ timer_reduce(&fscache_cookie_lru_timer, unused_at);
+ break;
+ }
- if (cookie->parent) {
- ASSERTCMP(refcount_read(&cookie->parent->ref), >, 0);
- ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
- atomic_dec(&cookie->parent->n_children);
+ list_del_init(&cookie->commit_link);
+ fscache_stat_d(&fscache_n_cookies_lru);
+ spin_unlock(&fscache_cookie_lru_lock);
+ fscache_cookie_lru_do_one(cookie);
+ spin_lock(&fscache_cookie_lru_lock);
}
- /* Dispose of the netfs's link to the cookie */
- fscache_cookie_put(cookie, fscache_cookie_put_relinquish);
+ spin_unlock(&fscache_cookie_lru_lock);
+}
- _leave("");
+static void fscache_cookie_lru_timed_out(struct timer_list *timer)
+{
+ queue_work(fscache_wq, &fscache_cookie_lru_work);
+}
+
+static void fscache_cookie_drop_from_lru(struct fscache_cookie *cookie)
+{
+ bool need_put = false;
+
+ if (!list_empty(&cookie->commit_link)) {
+ spin_lock(&fscache_cookie_lru_lock);
+ if (!list_empty(&cookie->commit_link)) {
+ list_del_init(&cookie->commit_link);
+ fscache_stat_d(&fscache_n_cookies_lru);
+ fscache_stat(&fscache_n_cookies_lru_dropped);
+ need_put = true;
+ }
+ spin_unlock(&fscache_cookie_lru_lock);
+ if (need_put)
+ fscache_put_cookie(cookie, fscache_cookie_put_lru);
+ }
}
-EXPORT_SYMBOL(__fscache_relinquish_cookie);
/*
* Remove a cookie from the hash table.
@@ -851,43 +908,91 @@ static void fscache_unhash_cookie(struct fscache_cookie *cookie)
hlist_bl_lock(h);
hlist_bl_del(&cookie->hash_link);
+ clear_bit(FSCACHE_COOKIE_IS_HASHED, &cookie->flags);
hlist_bl_unlock(h);
+ fscache_stat(&fscache_n_relinquishes_dropped);
}
+static void fscache_drop_withdraw_cookie(struct fscache_cookie *cookie)
+{
+ fscache_cookie_drop_from_lru(cookie);
+ __fscache_withdraw_cookie(cookie);
+}
+
+/**
+ * fscache_withdraw_cookie - Mark a cookie for withdrawal
+ * @cookie: The cookie to be withdrawn.
+ *
+ * Allow the cache backend to withdraw the backing for a cookie for its own
+ * reasons, even if that cookie is in active use.
+ */
+void fscache_withdraw_cookie(struct fscache_cookie *cookie)
+{
+ set_bit(FSCACHE_COOKIE_DO_WITHDRAW, &cookie->flags);
+ fscache_drop_withdraw_cookie(cookie);
+}
+EXPORT_SYMBOL(fscache_withdraw_cookie);
+
/*
- * Drop a reference to a cookie.
+ * Allow the netfs to release a cookie back to the cache.
+ * - the object will be marked as recyclable on disk if retire is true
*/
-void fscache_cookie_put(struct fscache_cookie *cookie,
- enum fscache_cookie_trace where)
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
{
- struct fscache_cookie *parent;
- int ref;
+ fscache_stat(&fscache_n_relinquishes);
+ if (retire)
+ fscache_stat(&fscache_n_relinquishes_retire);
+
+ _enter("c=%08x{%d},%d",
+ cookie->debug_id, atomic_read(&cookie->n_active), retire);
- _enter("%x", cookie->debug_id);
+ if (WARN(test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags),
+ "Cookie c=%x already relinquished\n", cookie->debug_id))
+ return;
- do {
- unsigned int cookie_debug_id = cookie->debug_id;
- bool zero = __refcount_dec_and_test(&cookie->ref, &ref);
+ if (retire)
+ set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+ trace_fscache_relinquish(cookie, retire);
- trace_fscache_cookie(cookie_debug_id, ref - 1, where);
- if (!zero)
- return;
+ ASSERTCMP(atomic_read(&cookie->n_active), ==, 0);
+ ASSERTCMP(atomic_read(&cookie->volume->n_cookies), >, 0);
+ atomic_dec(&cookie->volume->n_cookies);
- parent = cookie->parent;
+ if (test_bit(FSCACHE_COOKIE_HAS_BEEN_CACHED, &cookie->flags)) {
+ set_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags);
+ fscache_drop_withdraw_cookie(cookie);
+ } else {
+ fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_DROPPED);
fscache_unhash_cookie(cookie);
- fscache_free_cookie(cookie);
+ }
+ fscache_put_cookie(cookie, fscache_cookie_put_relinquish);
+}
+EXPORT_SYMBOL(__fscache_relinquish_cookie);
- cookie = parent;
- where = fscache_cookie_put_parent;
- } while (cookie);
+/*
+ * Drop a reference to a cookie.
+ */
+void fscache_put_cookie(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
+{
+ struct fscache_volume *volume = cookie->volume;
+ unsigned int cookie_debug_id = cookie->debug_id;
+ bool zero;
+ int ref;
- _leave("");
+ zero = __refcount_dec_and_test(&cookie->ref, &ref);
+ trace_fscache_cookie(cookie_debug_id, ref - 1, where);
+ if (zero) {
+ fscache_free_cookie(cookie);
+ fscache_put_volume(volume, fscache_volume_put_cookie);
+ }
}
+EXPORT_SYMBOL(fscache_put_cookie);
/*
* Get a reference to a cookie.
*/
-struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *cookie,
+struct fscache_cookie *fscache_get_cookie(struct fscache_cookie *cookie,
enum fscache_cookie_trace where)
{
int ref;
@@ -896,86 +1001,75 @@ struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *cookie,
trace_fscache_cookie(cookie->debug_id, ref + 1, where);
return cookie;
}
+EXPORT_SYMBOL(fscache_get_cookie);
/*
- * check the consistency between the netfs inode and the backing cache
- *
- * NOTE: it only serves no-index type
+ * Ask the cache to effect invalidation of a cookie.
*/
-int __fscache_check_consistency(struct fscache_cookie *cookie,
- const void *aux_data)
+static void fscache_perform_invalidation(struct fscache_cookie *cookie)
{
- struct fscache_operation *op;
- struct fscache_object *object;
- bool wake_cookie = false;
- int ret;
-
- _enter("%p,", cookie);
+ if (!cookie->volume->cache->ops->invalidate_cookie(cookie))
+ fscache_caching_failed(cookie);
+ fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end);
+}
- ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+/*
+ * Invalidate an object.
+ */
+void __fscache_invalidate(struct fscache_cookie *cookie,
+ const void *aux_data, loff_t new_size,
+ unsigned int flags)
+{
+ bool is_caching;
- if (fscache_wait_for_deferred_lookup(cookie) < 0)
- return -ERESTARTSYS;
+ _enter("c=%x", cookie->debug_id);
- if (hlist_empty(&cookie->backing_objects))
- return 0;
+ fscache_stat(&fscache_n_invalidates);
- op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
- if (!op)
- return -ENOMEM;
+ if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags),
+ "Trying to invalidate relinquished cookie\n"))
+ return;
- fscache_operation_init(cookie, op, NULL, NULL, NULL);
- op->flags = FSCACHE_OP_MYTHREAD |
- (1 << FSCACHE_OP_WAITING) |
- (1 << FSCACHE_OP_UNUSE_COOKIE);
- trace_fscache_page_op(cookie, NULL, op, fscache_page_op_check_consistency);
+ if ((flags & FSCACHE_INVAL_DIO_WRITE) &&
+ test_and_set_bit(FSCACHE_COOKIE_DISABLED, &cookie->flags))
+ return;
spin_lock(&cookie->lock);
+ set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ fscache_update_aux(cookie, aux_data, &new_size);
+ cookie->inval_counter++;
+ trace_fscache_invalidate(cookie, new_size);
- fscache_update_aux(cookie, aux_data);
-
- if (!fscache_cookie_enabled(cookie) ||
- hlist_empty(&cookie->backing_objects))
- goto inconsistent;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
- if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
- goto inconsistent;
-
- op->debug_id = atomic_inc_return(&fscache_op_debug_id);
+ switch (cookie->state) {
+ case FSCACHE_COOKIE_STATE_INVALIDATING: /* is_still_valid will catch it */
+ default:
+ spin_unlock(&cookie->lock);
+ _leave(" [no %u]", cookie->state);
+ return;
- __fscache_use_cookie(cookie);
- if (fscache_submit_op(object, op) < 0)
- goto submit_failed;
+ case FSCACHE_COOKIE_STATE_LOOKING_UP:
+ case FSCACHE_COOKIE_STATE_CREATING:
+ spin_unlock(&cookie->lock);
+ _leave(" [look %x]", cookie->inval_counter);
+ return;
- /* the work queue now carries its own ref on the object */
- spin_unlock(&cookie->lock);
+ case FSCACHE_COOKIE_STATE_ACTIVE:
+ is_caching = fscache_begin_cookie_access(
+ cookie, fscache_access_invalidate_cookie);
+ if (is_caching)
+ __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_INVALIDATING);
+ spin_unlock(&cookie->lock);
+ wake_up_cookie_state(cookie);
- ret = fscache_wait_for_operation_activation(object, op, NULL, NULL);
- if (ret == 0) {
- /* ask the cache to honour the operation */
- ret = object->cache->ops->check_consistency(op);
- fscache_op_complete(op, false);
- } else if (ret == -ENOBUFS) {
- ret = 0;
+ if (is_caching)
+ fscache_queue_cookie(cookie, fscache_cookie_get_inval_work);
+ _leave(" [inv]");
+ return;
}
-
- fscache_put_operation(op);
- _leave(" = %d", ret);
- return ret;
-
-submit_failed:
- wake_cookie = __fscache_unuse_cookie(cookie);
-inconsistent:
- spin_unlock(&cookie->lock);
- if (wake_cookie)
- __fscache_wake_unused_cookie(cookie);
- kfree(op);
- _leave(" = -ESTALE");
- return -ESTALE;
}
-EXPORT_SYMBOL(__fscache_check_consistency);
+EXPORT_SYMBOL(__fscache_invalidate);
+#ifdef CONFIG_PROC_FS
/*
* Generate a list of extant cookies in /proc/fs/fscache/cookies
*/
@@ -983,44 +1077,27 @@ static int fscache_cookies_seq_show(struct seq_file *m, void *v)
{
struct fscache_cookie *cookie;
unsigned int keylen = 0, auxlen = 0;
- char _type[3], *type;
u8 *p;
if (v == &fscache_cookies) {
seq_puts(m,
- "COOKIE PARENT USAGE CHILD ACT TY FL DEF NETFS_DATA\n"
- "======== ======== ===== ===== === == === ================ ==========\n"
+ "COOKIE VOLUME REF ACT ACC S FL DEF \n"
+ "======== ======== === === === = == ================\n"
);
return 0;
}
cookie = list_entry(v, struct fscache_cookie, proc_link);
- switch (cookie->type) {
- case 0:
- type = "IX";
- break;
- case 1:
- type = "DT";
- break;
- default:
- snprintf(_type, sizeof(_type), "%02u",
- cookie->type);
- type = _type;
- break;
- }
-
seq_printf(m,
- "%08x %08x %5u %5u %3u %s %03lx %-16s %px",
+ "%08x %08x %3d %3d %3d %c %02lx",
cookie->debug_id,
- cookie->parent ? cookie->parent->debug_id : 0,
+ cookie->volume->debug_id,
refcount_read(&cookie->ref),
- atomic_read(&cookie->n_children),
atomic_read(&cookie->n_active),
- type,
- cookie->flags,
- cookie->def->name,
- cookie->netfs_data);
+ atomic_read(&cookie->n_accesses),
+ fscache_cookie_states[cookie->state],
+ cookie->flags);
keylen = cookie->key_len;
auxlen = cookie->aux_len;
@@ -1069,3 +1146,4 @@ const struct seq_operations fscache_cookies_seq_ops = {
.stop = fscache_cookies_seq_stop,
.show = fscache_cookies_seq_show,
};
+#endif
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
deleted file mode 100644
index 0402673c680e..000000000000
--- a/fs/fscache/fsdef.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Filesystem index definition
- *
- * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#define FSCACHE_DEBUG_LEVEL CACHE
-#include <linux/module.h>
-#include "internal.h"
-
-static
-enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
- const void *data,
- uint16_t datalen,
- loff_t object_size);
-
-/*
- * The root index is owned by FS-Cache itself.
- *
- * When a netfs requests caching facilities, FS-Cache will, if one doesn't
- * already exist, create an entry in the root index with the key being the name
- * of the netfs ("AFS" for example), and the auxiliary data holding the index
- * structure version supplied by the netfs:
- *
- * FSDEF
- * |
- * +-----------+
- * | |
- * NFS AFS
- * [v=1] [v=1]
- *
- * If an entry with the appropriate name does already exist, the version is
- * compared. If the version is different, the entire subtree from that entry
- * will be discarded and a new entry created.
- *
- * The new entry will be an index, and a cookie referring to it will be passed
- * to the netfs. This is then the root handle by which the netfs accesses the
- * cache. It can create whatever objects it likes in that index, including
- * further indices.
- */
-static struct fscache_cookie_def fscache_fsdef_index_def = {
- .name = ".FS-Cache",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-struct fscache_cookie fscache_fsdef_index = {
- .debug_id = 1,
- .ref = REFCOUNT_INIT(1),
- .n_active = ATOMIC_INIT(1),
- .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
- .backing_objects = HLIST_HEAD_INIT,
- .def = &fscache_fsdef_index_def,
- .flags = 1 << FSCACHE_COOKIE_ENABLED,
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-EXPORT_SYMBOL(fscache_fsdef_index);
-
-/*
- * Definition of an entry in the root index. Each entry is an index, keyed to
- * a specific netfs and only applicable to a particular version of the index
- * structure used by that netfs.
- */
-struct fscache_cookie_def fscache_fsdef_netfs_def = {
- .name = "FSDEF.netfs",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
- .check_aux = fscache_fsdef_netfs_check_aux,
-};
-
-/*
- * check that the index structure version number stored in the auxiliary data
- * matches the one the netfs gave us
- */
-static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
- void *cookie_netfs_data,
- const void *data,
- uint16_t datalen,
- loff_t object_size)
-{
- struct fscache_netfs *netfs = cookie_netfs_data;
- uint32_t version;
-
- _enter("{%s},,%hu", netfs->name, datalen);
-
- if (datalen != sizeof(version)) {
- _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
- return FSCACHE_CHECKAUX_OBSOLETE;
- }
-
- memcpy(&version, data, sizeof(version));
- if (version != netfs->version) {
- _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
- return FSCACHE_CHECKAUX_OBSOLETE;
- }
-
- _leave(" = OKAY");
- return FSCACHE_CHECKAUX_OKAY;
-}
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index c3e4804b8fcb..1336f517e9b1 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -1,65 +1,71 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Internal definitions for FS-Cache
*
- * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
-/*
- * Lock order, in the order in which multiple locks should be obtained:
- * - fscache_addremove_sem
- * - cookie->lock
- * - cookie->parent->lock
- * - cache->object_list_lock
- * - object->lock
- * - object->parent->lock
- * - cookie->stores_lock
- * - fscache_thread_lock
- *
- */
-
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) "FS-Cache: " fmt
+#include <linux/slab.h>
#include <linux/fscache-cache.h>
#include <trace/events/fscache.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
-#define FSCACHE_MIN_THREADS 4
-#define FSCACHE_MAX_THREADS 32
-
/*
* cache.c
*/
-extern struct list_head fscache_cache_list;
-extern struct rw_semaphore fscache_addremove_sem;
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_caches_seq_ops;
+#endif
+bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
+void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
-extern struct fscache_cache *fscache_select_cache_for_object(
- struct fscache_cookie *);
+static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
+{
+ return smp_load_acquire(&cache->state);
+}
+
+static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
+{
+ return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
+}
+
+static inline void fscache_set_cache_state(struct fscache_cache *cache,
+ enum fscache_cache_state new_state)
+{
+ smp_store_release(&cache->state, new_state);
+
+}
+
+static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
+ enum fscache_cache_state old_state,
+ enum fscache_cache_state new_state)
+{
+ return try_cmpxchg_release(&cache->state, &old_state, new_state);
+}
/*
* cookie.c
*/
extern struct kmem_cache *fscache_cookie_jar;
+#ifdef CONFIG_PROC_FS
extern const struct seq_operations fscache_cookies_seq_ops;
+#endif
+extern struct timer_list fscache_cookie_lru_timer;
-extern void fscache_free_cookie(struct fscache_cookie *);
-extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *,
- const struct fscache_cookie_def *,
- const void *, size_t,
- const void *, size_t,
- void *, loff_t);
-extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *);
-extern struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *,
- enum fscache_cookie_trace);
-extern void fscache_cookie_put(struct fscache_cookie *,
- enum fscache_cookie_trace);
-
-static inline void fscache_cookie_see(struct fscache_cookie *cookie,
+extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
+extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+
+static inline void fscache_see_cookie(struct fscache_cookie *cookie,
enum fscache_cookie_trace where)
{
trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
@@ -67,60 +73,11 @@ static inline void fscache_cookie_see(struct fscache_cookie *cookie,
}
/*
- * fsdef.c
- */
-extern struct fscache_cookie fscache_fsdef_index;
-extern struct fscache_cookie_def fscache_fsdef_netfs_def;
-
-/*
* main.c
*/
-extern unsigned fscache_defer_lookup;
-extern unsigned fscache_defer_create;
extern unsigned fscache_debug;
-extern struct kobject *fscache_root;
-extern struct workqueue_struct *fscache_object_wq;
-extern struct workqueue_struct *fscache_op_wq;
-DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
-
-extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n);
-
-static inline bool fscache_object_congested(void)
-{
- return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
-}
-
-/*
- * object.c
- */
-extern void fscache_enqueue_object(struct fscache_object *);
-
-/*
- * operation.c
- */
-extern int fscache_submit_exclusive_op(struct fscache_object *,
- struct fscache_operation *);
-extern int fscache_submit_op(struct fscache_object *,
- struct fscache_operation *);
-extern int fscache_cancel_op(struct fscache_operation *, bool);
-extern void fscache_cancel_all_ops(struct fscache_object *);
-extern void fscache_abort_object(struct fscache_object *);
-extern void fscache_start_operations(struct fscache_object *);
-extern void fscache_operation_gc(struct work_struct *);
-/*
- * page.c
- */
-extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *);
-extern int fscache_wait_for_operation_activation(struct fscache_object *,
- struct fscache_operation *,
- atomic_t *,
- atomic_t *);
-extern void fscache_invalidate_writes(struct fscache_cookie *);
-struct fscache_retrieval *fscache_alloc_retrieval(struct fscache_cookie *cookie,
- struct address_space *mapping,
- fscache_rw_complete_t end_io_func,
- void *context);
+extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
/*
* proc.c
@@ -137,125 +94,27 @@ extern void fscache_proc_cleanup(void);
* stats.c
*/
#ifdef CONFIG_FSCACHE_STATS
-extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
-extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
-
-extern atomic_t fscache_n_op_pend;
-extern atomic_t fscache_n_op_run;
-extern atomic_t fscache_n_op_enqueue;
-extern atomic_t fscache_n_op_deferred_release;
-extern atomic_t fscache_n_op_initialised;
-extern atomic_t fscache_n_op_release;
-extern atomic_t fscache_n_op_gc;
-extern atomic_t fscache_n_op_cancelled;
-extern atomic_t fscache_n_op_rejected;
-
-extern atomic_t fscache_n_attr_changed;
-extern atomic_t fscache_n_attr_changed_ok;
-extern atomic_t fscache_n_attr_changed_nobufs;
-extern atomic_t fscache_n_attr_changed_nomem;
-extern atomic_t fscache_n_attr_changed_calls;
-
-extern atomic_t fscache_n_allocs;
-extern atomic_t fscache_n_allocs_ok;
-extern atomic_t fscache_n_allocs_wait;
-extern atomic_t fscache_n_allocs_nobufs;
-extern atomic_t fscache_n_allocs_intr;
-extern atomic_t fscache_n_allocs_object_dead;
-extern atomic_t fscache_n_alloc_ops;
-extern atomic_t fscache_n_alloc_op_waits;
-
-extern atomic_t fscache_n_retrievals;
-extern atomic_t fscache_n_retrievals_ok;
-extern atomic_t fscache_n_retrievals_wait;
-extern atomic_t fscache_n_retrievals_nodata;
-extern atomic_t fscache_n_retrievals_nobufs;
-extern atomic_t fscache_n_retrievals_intr;
-extern atomic_t fscache_n_retrievals_nomem;
-extern atomic_t fscache_n_retrievals_object_dead;
-extern atomic_t fscache_n_retrieval_ops;
-extern atomic_t fscache_n_retrieval_op_waits;
-
-extern atomic_t fscache_n_stores;
-extern atomic_t fscache_n_stores_ok;
-extern atomic_t fscache_n_stores_again;
-extern atomic_t fscache_n_stores_nobufs;
-extern atomic_t fscache_n_stores_oom;
-extern atomic_t fscache_n_store_ops;
-extern atomic_t fscache_n_store_calls;
-extern atomic_t fscache_n_store_pages;
-extern atomic_t fscache_n_store_radix_deletes;
-extern atomic_t fscache_n_store_pages_over_limit;
-
-extern atomic_t fscache_n_store_vmscan_not_storing;
-extern atomic_t fscache_n_store_vmscan_gone;
-extern atomic_t fscache_n_store_vmscan_busy;
-extern atomic_t fscache_n_store_vmscan_cancelled;
-extern atomic_t fscache_n_store_vmscan_wait;
-
-extern atomic_t fscache_n_marks;
-extern atomic_t fscache_n_uncaches;
+extern atomic_t fscache_n_volumes;
+extern atomic_t fscache_n_volumes_collision;
+extern atomic_t fscache_n_volumes_nomem;
+extern atomic_t fscache_n_cookies;
+extern atomic_t fscache_n_cookies_lru;
+extern atomic_t fscache_n_cookies_lru_expired;
+extern atomic_t fscache_n_cookies_lru_removed;
+extern atomic_t fscache_n_cookies_lru_dropped;
extern atomic_t fscache_n_acquires;
-extern atomic_t fscache_n_acquires_null;
-extern atomic_t fscache_n_acquires_no_cache;
extern atomic_t fscache_n_acquires_ok;
-extern atomic_t fscache_n_acquires_nobufs;
extern atomic_t fscache_n_acquires_oom;
extern atomic_t fscache_n_invalidates;
-extern atomic_t fscache_n_invalidates_run;
-
-extern atomic_t fscache_n_updates;
-extern atomic_t fscache_n_updates_null;
-extern atomic_t fscache_n_updates_run;
extern atomic_t fscache_n_relinquishes;
-extern atomic_t fscache_n_relinquishes_null;
-extern atomic_t fscache_n_relinquishes_waitcrt;
extern atomic_t fscache_n_relinquishes_retire;
+extern atomic_t fscache_n_relinquishes_dropped;
-extern atomic_t fscache_n_cookie_index;
-extern atomic_t fscache_n_cookie_data;
-extern atomic_t fscache_n_cookie_special;
-
-extern atomic_t fscache_n_object_alloc;
-extern atomic_t fscache_n_object_no_alloc;
-extern atomic_t fscache_n_object_lookups;
-extern atomic_t fscache_n_object_lookups_negative;
-extern atomic_t fscache_n_object_lookups_positive;
-extern atomic_t fscache_n_object_lookups_timed_out;
-extern atomic_t fscache_n_object_created;
-extern atomic_t fscache_n_object_avail;
-extern atomic_t fscache_n_object_dead;
-
-extern atomic_t fscache_n_checkaux_none;
-extern atomic_t fscache_n_checkaux_okay;
-extern atomic_t fscache_n_checkaux_update;
-extern atomic_t fscache_n_checkaux_obsolete;
-
-extern atomic_t fscache_n_cop_alloc_object;
-extern atomic_t fscache_n_cop_lookup_object;
-extern atomic_t fscache_n_cop_lookup_complete;
-extern atomic_t fscache_n_cop_grab_object;
-extern atomic_t fscache_n_cop_invalidate_object;
-extern atomic_t fscache_n_cop_update_object;
-extern atomic_t fscache_n_cop_drop_object;
-extern atomic_t fscache_n_cop_put_object;
-extern atomic_t fscache_n_cop_sync_cache;
-extern atomic_t fscache_n_cop_attr_changed;
-extern atomic_t fscache_n_cop_read_or_alloc_page;
-extern atomic_t fscache_n_cop_read_or_alloc_pages;
-extern atomic_t fscache_n_cop_allocate_page;
-extern atomic_t fscache_n_cop_allocate_pages;
-extern atomic_t fscache_n_cop_write_page;
-extern atomic_t fscache_n_cop_uncache_page;
-extern atomic_t fscache_n_cop_dissociate_pages;
-
-extern atomic_t fscache_n_cache_no_space_reject;
-extern atomic_t fscache_n_cache_stale_objects;
-extern atomic_t fscache_n_cache_retired_objects;
-extern atomic_t fscache_n_cache_culled_objects;
+extern atomic_t fscache_n_resizes;
+extern atomic_t fscache_n_resizes_null;
static inline void fscache_stat(atomic_t *stat)
{
@@ -278,71 +137,28 @@ int fscache_stats_show(struct seq_file *m, void *v);
#endif
/*
- * raise an event on an object
- * - if the event is not masked for that object, then the object is
- * queued for attention by the thread pool.
+ * volume.c
*/
-static inline void fscache_raise_event(struct fscache_object *object,
- unsigned event)
-{
- BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
-#if 0
- printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
- object->debug_id, object->event_mask, (1 << event));
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_volumes_seq_ops;
#endif
- if (!test_and_set_bit(event, &object->events) &&
- test_bit(event, &object->event_mask))
- fscache_enqueue_object(object);
-}
-/*
- * get an extra reference to a netfs retrieval context
- */
-static inline
-void *fscache_get_context(struct fscache_cookie *cookie, void *context)
-{
- if (cookie->def->get_context)
- cookie->def->get_context(cookie->netfs_data, context);
- return context;
-}
+struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+void fscache_put_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+bool fscache_begin_volume_access(struct fscache_volume *volume,
+ struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+void fscache_create_volume(struct fscache_volume *volume, bool wait);
-/*
- * release a reference to a netfs retrieval context
- */
-static inline
-void fscache_put_context(struct fscache_cookie *cookie, void *context)
-{
- if (cookie->def->put_context)
- cookie->def->put_context(cookie->netfs_data, context);
-}
-
-/*
- * Update the auxiliary data on a cookie.
- */
-static inline
-void fscache_update_aux(struct fscache_cookie *cookie, const void *aux_data)
-{
- void *p;
-
- if (!aux_data)
- return;
- if (cookie->aux_len <= sizeof(cookie->inline_aux))
- p = cookie->inline_aux;
- else
- p = cookie->aux;
-
- if (memcmp(p, aux_data, cookie->aux_len) != 0) {
- memcpy(p, aux_data, cookie->aux_len);
- set_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags);
- }
-}
/*****************************************************************************/
/*
* debug tracing
*/
#define dbgprintk(FMT, ...) \
- printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+ printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
@@ -395,7 +211,7 @@ do { \
#define FSCACHE_DEBUG_CACHE 0
#define FSCACHE_DEBUG_COOKIE 1
-#define FSCACHE_DEBUG_PAGE 2
+#define FSCACHE_DEBUG_OBJECT 2
#define FSCACHE_DEBUG_OPERATION 3
#define FSCACHE_POINT_ENTER 1
diff --git a/fs/fscache/io.c b/fs/fscache/io.c
index 8ecc1141802f..3af3b08a9bb3 100644
--- a/fs/fscache/io.c
+++ b/fs/fscache/io.c
@@ -4,113 +4,324 @@
* Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
-
-#define FSCACHE_DEBUG_LEVEL PAGE
-#include <linux/module.h>
-#define FSCACHE_USE_NEW_IO_API
+#define FSCACHE_DEBUG_LEVEL OPERATION
#include <linux/fscache-cache.h>
+#include <linux/uio.h>
+#include <linux/bvec.h>
#include <linux/slab.h>
-#include <linux/netfs.h>
+#include <linux/uio.h>
#include "internal.h"
-/*
- * Start a cache read operation.
- * - we return:
- * -ENOMEM - out of memory, some pages may be being read
- * -ERESTARTSYS - interrupted, some pages may be being read
- * -ENOBUFS - no backing object or space available in which to cache any
- * pages not being read
- * -ENODATA - no data available in the backing object for some or all of
- * the pages
- * 0 - dispatched a read on all pages
+/**
+ * fscache_wait_for_operation - Wait for an object to become accessible
+ * @cres: The cache resources for the operation being performed
+ * @want_state: The minimum state the object must be at
+ *
+ * See if the target cache object is at the specified minimum state of
+ * accessibility yet, and if not, wait for it.
*/
-int __fscache_begin_read_operation(struct netfs_read_request *rreq,
- struct fscache_cookie *cookie)
+bool fscache_wait_for_operation(struct netfs_cache_resources *cres,
+ enum fscache_want_state want_state)
{
- struct fscache_retrieval *op;
- struct fscache_object *object;
- bool wake_cookie = false;
- int ret;
+ struct fscache_cookie *cookie = fscache_cres_cookie(cres);
+ enum fscache_cookie_state state;
- _enter("rr=%08x", rreq->debug_id);
+again:
+ if (!fscache_cache_is_live(cookie->volume->cache)) {
+ _leave(" [broken]");
+ return false;
+ }
- fscache_stat(&fscache_n_retrievals);
+ state = fscache_cookie_state(cookie);
+ _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
- if (hlist_empty(&cookie->backing_objects))
- goto nobufs;
+ switch (state) {
+ case FSCACHE_COOKIE_STATE_CREATING:
+ case FSCACHE_COOKIE_STATE_INVALIDATING:
+ if (want_state == FSCACHE_WANT_PARAMS)
+ goto ready; /* There can be no content */
+ fallthrough;
+ case FSCACHE_COOKIE_STATE_LOOKING_UP:
+ case FSCACHE_COOKIE_STATE_LRU_DISCARDING:
+ wait_var_event(&cookie->state,
+ fscache_cookie_state(cookie) != state);
+ goto again;
- if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
- _leave(" = -ENOBUFS [invalidating]");
- return -ENOBUFS;
+ case FSCACHE_COOKIE_STATE_ACTIVE:
+ goto ready;
+ case FSCACHE_COOKIE_STATE_DROPPED:
+ case FSCACHE_COOKIE_STATE_RELINQUISHING:
+ default:
+ _leave(" [not live]");
+ return false;
}
- ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+ready:
+ if (!cres->cache_priv2)
+ return cookie->volume->cache->ops->begin_operation(cres, want_state);
+ return true;
+}
+EXPORT_SYMBOL(fscache_wait_for_operation);
+
+/*
+ * Begin an I/O operation on the cache, waiting till we reach the right state.
+ *
+ * Attaches the resources required to the operation resources record.
+ */
+static int fscache_begin_operation(struct netfs_cache_resources *cres,
+ struct fscache_cookie *cookie,
+ enum fscache_want_state want_state,
+ enum fscache_access_trace why)
+{
+ enum fscache_cookie_state state;
+ long timeo;
+ bool once_only = false;
- if (fscache_wait_for_deferred_lookup(cookie) < 0)
- return -ERESTARTSYS;
+ cres->ops = NULL;
+ cres->cache_priv = cookie;
+ cres->cache_priv2 = NULL;
+ cres->debug_id = cookie->debug_id;
+ cres->inval_counter = cookie->inval_counter;
- op = fscache_alloc_retrieval(cookie, NULL, NULL, NULL);
- if (!op)
- return -ENOMEM;
- trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi);
+ if (!fscache_begin_cookie_access(cookie, why))
+ return -ENOBUFS;
+again:
spin_lock(&cookie->lock);
- if (!fscache_cookie_enabled(cookie) ||
- hlist_empty(&cookie->backing_objects))
- goto nobufs_unlock;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
+ state = fscache_cookie_state(cookie);
+ _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
- __fscache_use_cookie(cookie);
- atomic_inc(&object->n_reads);
- __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+ switch (state) {
+ case FSCACHE_COOKIE_STATE_LOOKING_UP:
+ case FSCACHE_COOKIE_STATE_LRU_DISCARDING:
+ case FSCACHE_COOKIE_STATE_INVALIDATING:
+ goto wait_for_file_wrangling;
+ case FSCACHE_COOKIE_STATE_CREATING:
+ if (want_state == FSCACHE_WANT_PARAMS)
+ goto ready; /* There can be no content */
+ goto wait_for_file_wrangling;
+ case FSCACHE_COOKIE_STATE_ACTIVE:
+ goto ready;
+ case FSCACHE_COOKIE_STATE_DROPPED:
+ case FSCACHE_COOKIE_STATE_RELINQUISHING:
+ WARN(1, "Can't use cookie in state %u\n", cookie->state);
+ goto not_live;
+ default:
+ goto not_live;
+ }
- if (fscache_submit_op(object, &op->op) < 0)
- goto nobufs_unlock_dec;
+ready:
spin_unlock(&cookie->lock);
+ if (!cookie->volume->cache->ops->begin_operation(cres, want_state))
+ goto failed;
+ return 0;
- fscache_stat(&fscache_n_retrieval_ops);
+wait_for_file_wrangling:
+ spin_unlock(&cookie->lock);
+ trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref),
+ atomic_read(&cookie->n_accesses),
+ fscache_access_io_wait);
+ timeo = wait_var_event_timeout(&cookie->state,
+ fscache_cookie_state(cookie) != state, 20 * HZ);
+ if (timeo <= 1 && !once_only) {
+ pr_warn("%s: cookie state change wait timed out: cookie->state=%u state=%u",
+ __func__, fscache_cookie_state(cookie), state);
+ fscache_print_cookie(cookie, 'O');
+ once_only = true;
+ }
+ goto again;
- /* we wait for the operation to become active, and then process it
- * *here*, in this thread, and not in the thread pool */
- ret = fscache_wait_for_operation_activation(
- object, &op->op,
- __fscache_stat(&fscache_n_retrieval_op_waits),
- __fscache_stat(&fscache_n_retrievals_object_dead));
- if (ret < 0)
- goto error;
-
- /* ask the cache to honour the operation */
- ret = object->cache->ops->begin_read_operation(rreq, op);
-
-error:
- if (ret == -ENOMEM)
- fscache_stat(&fscache_n_retrievals_nomem);
- else if (ret == -ERESTARTSYS)
- fscache_stat(&fscache_n_retrievals_intr);
- else if (ret == -ENODATA)
- fscache_stat(&fscache_n_retrievals_nodata);
- else if (ret < 0)
- fscache_stat(&fscache_n_retrievals_nobufs);
- else
- fscache_stat(&fscache_n_retrievals_ok);
-
- fscache_put_retrieval(op);
- _leave(" = %d", ret);
- return ret;
-
-nobufs_unlock_dec:
- atomic_dec(&object->n_reads);
- wake_cookie = __fscache_unuse_cookie(cookie);
-nobufs_unlock:
+not_live:
spin_unlock(&cookie->lock);
- fscache_put_retrieval(op);
- if (wake_cookie)
- __fscache_wake_unused_cookie(cookie);
-nobufs:
- fscache_stat(&fscache_n_retrievals_nobufs);
+failed:
+ cres->cache_priv = NULL;
+ cres->ops = NULL;
+ fscache_end_cookie_access(cookie, fscache_access_io_not_live);
_leave(" = -ENOBUFS");
return -ENOBUFS;
}
+
+int __fscache_begin_read_operation(struct netfs_cache_resources *cres,
+ struct fscache_cookie *cookie)
+{
+ return fscache_begin_operation(cres, cookie, FSCACHE_WANT_PARAMS,
+ fscache_access_io_read);
+}
EXPORT_SYMBOL(__fscache_begin_read_operation);
+
+int __fscache_begin_write_operation(struct netfs_cache_resources *cres,
+ struct fscache_cookie *cookie)
+{
+ return fscache_begin_operation(cres, cookie, FSCACHE_WANT_PARAMS,
+ fscache_access_io_write);
+}
+EXPORT_SYMBOL(__fscache_begin_write_operation);
+
+/**
+ * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback
+ * @mapping: The mapping the folio belongs to.
+ * @folio: The folio being dirtied.
+ * @cookie: The cookie referring to the cache object
+ *
+ * Set the dirty flag on a folio and pin an in-use cache object in memory
+ * so that writeback can later write to it. This is intended
+ * to be called from the filesystem's ->dirty_folio() method.
+ *
+ * Return: true if the dirty flag was set on the folio, false otherwise.
+ */
+bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
+ struct fscache_cookie *cookie)
+{
+ struct inode *inode = mapping->host;
+ bool need_use = false;
+
+ _enter("");
+
+ if (!filemap_dirty_folio(mapping, folio))
+ return false;
+ if (!fscache_cookie_valid(cookie))
+ return true;
+
+ if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
+ inode->i_state |= I_PINNING_FSCACHE_WB;
+ need_use = true;
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (need_use)
+ fscache_use_cookie(cookie, true);
+ }
+ return true;
+}
+EXPORT_SYMBOL(fscache_dirty_folio);
+
+struct fscache_write_request {
+ struct netfs_cache_resources cache_resources;
+ struct address_space *mapping;
+ loff_t start;
+ size_t len;
+ bool set_bits;
+ netfs_io_terminated_t term_func;
+ void *term_func_priv;
+};
+
+void __fscache_clear_page_bits(struct address_space *mapping,
+ loff_t start, size_t len)
+{
+ pgoff_t first = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE;
+ struct page *page;
+
+ if (len) {
+ XA_STATE(xas, &mapping->i_pages, first);
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, last) {
+ end_page_fscache(page);
+ }
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(__fscache_clear_page_bits);
+
+/*
+ * Deal with the completion of writing the data to the cache.
+ */
+static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct fscache_write_request *wreq = priv;
+
+ fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
+ wreq->set_bits);
+
+ if (wreq->term_func)
+ wreq->term_func(wreq->term_func_priv, transferred_or_error,
+ was_async);
+ fscache_end_operation(&wreq->cache_resources);
+ kfree(wreq);
+}
+
+void __fscache_write_to_cache(struct fscache_cookie *cookie,
+ struct address_space *mapping,
+ loff_t start, size_t len, loff_t i_size,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv,
+ bool cond)
+{
+ struct fscache_write_request *wreq;
+ struct netfs_cache_resources *cres;
+ struct iov_iter iter;
+ int ret = -ENOBUFS;
+
+ if (len == 0)
+ goto abandon;
+
+ _enter("%llx,%zx", start, len);
+
+ wreq = kzalloc(sizeof(struct fscache_write_request), GFP_NOFS);
+ if (!wreq)
+ goto abandon;
+ wreq->mapping = mapping;
+ wreq->start = start;
+ wreq->len = len;
+ wreq->set_bits = cond;
+ wreq->term_func = term_func;
+ wreq->term_func_priv = term_func_priv;
+
+ cres = &wreq->cache_resources;
+ if (fscache_begin_operation(cres, cookie, FSCACHE_WANT_WRITE,
+ fscache_access_io_write) < 0)
+ goto abandon_free;
+
+ ret = cres->ops->prepare_write(cres, &start, &len, i_size, false);
+ if (ret < 0)
+ goto abandon_end;
+
+ /* TODO: Consider clearing page bits now for space the write isn't
+ * covering. This is more complicated than it appears when THPs are
+ * taken into account.
+ */
+
+ iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
+ fscache_write(cres, start, &iter, fscache_wreq_done, wreq);
+ return;
+
+abandon_end:
+ return fscache_wreq_done(wreq, ret, false);
+abandon_free:
+ kfree(wreq);
+abandon:
+ fscache_clear_page_bits(mapping, start, len, cond);
+ if (term_func)
+ term_func(term_func_priv, ret, false);
+}
+EXPORT_SYMBOL(__fscache_write_to_cache);
+
+/*
+ * Change the size of a backing object.
+ */
+void __fscache_resize_cookie(struct fscache_cookie *cookie, loff_t new_size)
+{
+ struct netfs_cache_resources cres;
+
+ trace_fscache_resize(cookie, new_size);
+ if (fscache_begin_operation(&cres, cookie, FSCACHE_WANT_WRITE,
+ fscache_access_io_resize) == 0) {
+ fscache_stat(&fscache_n_resizes);
+ set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags);
+
+ /* We cannot defer a resize as we need to do it inside the
+ * netfs's inode lock so that we're serialised with respect to
+ * writes.
+ */
+ cookie->volume->cache->ops->resize_cookie(&cres, new_size);
+ fscache_end_operation(&cres);
+ } else {
+ fscache_stat(&fscache_n_resizes_null);
+ }
+}
+EXPORT_SYMBOL(__fscache_resize_cookie);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 4207f98e405f..dad85fd84f6f 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -1,17 +1,13 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* General filesystem local caching manager
*
- * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#define FSCACHE_DEBUG_LEVEL CACHE
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/completion.h>
-#include <linux/slab.h>
-#include <linux/seq_file.h>
#define CREATE_TRACE_POINTS
#include "internal.h"
@@ -19,79 +15,18 @@ MODULE_DESCRIPTION("FS Cache Manager");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
-unsigned fscache_defer_lookup = 1;
-module_param_named(defer_lookup, fscache_defer_lookup, uint,
- S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(fscache_defer_lookup,
- "Defer cookie lookup to background thread");
-
-unsigned fscache_defer_create = 1;
-module_param_named(defer_create, fscache_defer_create, uint,
- S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(fscache_defer_create,
- "Defer cookie creation to background thread");
-
unsigned fscache_debug;
module_param_named(debug, fscache_debug, uint,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(fscache_debug,
"FS-Cache debugging mask");
-struct kobject *fscache_root;
-struct workqueue_struct *fscache_object_wq;
-struct workqueue_struct *fscache_op_wq;
-
-DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache);
+EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume);
+EXPORT_TRACEPOINT_SYMBOL(fscache_access);
-/* these values serve as lower bounds, will be adjusted in fscache_init() */
-static unsigned fscache_object_max_active = 4;
-static unsigned fscache_op_max_active = 2;
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *fscache_sysctl_header;
-
-static int fscache_max_active_sysctl(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- struct workqueue_struct **wqp = table->extra1;
- unsigned int *datap = table->data;
- int ret;
-
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret == 0)
- workqueue_set_max_active(*wqp, *datap);
- return ret;
-}
-
-static struct ctl_table fscache_sysctls[] = {
- {
- .procname = "object_max_active",
- .data = &fscache_object_max_active,
- .maxlen = sizeof(unsigned),
- .mode = 0644,
- .proc_handler = fscache_max_active_sysctl,
- .extra1 = &fscache_object_wq,
- },
- {
- .procname = "operation_max_active",
- .data = &fscache_op_max_active,
- .maxlen = sizeof(unsigned),
- .mode = 0644,
- .proc_handler = fscache_max_active_sysctl,
- .extra1 = &fscache_op_wq,
- },
- {}
-};
-
-static struct ctl_table fscache_sysctls_root[] = {
- {
- .procname = "fscache",
- .mode = 0555,
- .child = fscache_sysctls,
- },
- {}
-};
-#endif
+struct workqueue_struct *fscache_wq;
+EXPORT_SYMBOL(fscache_wq);
/*
* Mixing scores (in bits) for (7,20):
@@ -118,15 +53,16 @@ static inline unsigned int fold_hash(unsigned long x, unsigned long y)
/*
* Generate a hash. This is derived from full_name_hash(), but we want to be
* sure it is arch independent and that it doesn't change as bits of the
- * computed hash value might appear on disk. The caller also guarantees that
- * the hashed data will be a series of aligned 32-bit words.
+ * computed hash value might appear on disk. The caller must guarantee that
+ * the source data is a multiple of four bytes in size.
*/
-unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n)
+unsigned int fscache_hash(unsigned int salt, const void *data, size_t len)
{
- unsigned int a, x = 0, y = salt;
+ const __le32 *p = data;
+ unsigned int a, x = 0, y = salt, n = len / sizeof(__le32);
for (; n; n--) {
- a = *data++;
+ a = le32_to_cpu(*p++);
HASH_MIX(x, y, a);
}
return fold_hash(x, y);
@@ -137,44 +73,16 @@ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n)
*/
static int __init fscache_init(void)
{
- unsigned int nr_cpus = num_possible_cpus();
- unsigned int cpu;
- int ret;
-
- fscache_object_max_active =
- clamp_val(nr_cpus,
- fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
-
- ret = -ENOMEM;
- fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND,
- fscache_object_max_active);
- if (!fscache_object_wq)
- goto error_object_wq;
-
- fscache_op_max_active =
- clamp_val(fscache_object_max_active / 2,
- fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE);
+ int ret = -ENOMEM;
- ret = -ENOMEM;
- fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND,
- fscache_op_max_active);
- if (!fscache_op_wq)
- goto error_op_wq;
-
- for_each_possible_cpu(cpu)
- init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
+ fscache_wq = alloc_workqueue("fscache", WQ_UNBOUND | WQ_FREEZABLE, 0);
+ if (!fscache_wq)
+ goto error_wq;
ret = fscache_proc_init();
if (ret < 0)
goto error_proc;
-#ifdef CONFIG_SYSCTL
- ret = -ENOMEM;
- fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root);
- if (!fscache_sysctl_header)
- goto error_sysctl;
-#endif
-
fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
sizeof(struct fscache_cookie),
0, 0, NULL);
@@ -184,26 +92,14 @@ static int __init fscache_init(void)
goto error_cookie_jar;
}
- fscache_root = kobject_create_and_add("fscache", kernel_kobj);
- if (!fscache_root)
- goto error_kobj;
-
pr_notice("Loaded\n");
return 0;
-error_kobj:
- kmem_cache_destroy(fscache_cookie_jar);
error_cookie_jar:
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(fscache_sysctl_header);
-error_sysctl:
-#endif
fscache_proc_cleanup();
error_proc:
- destroy_workqueue(fscache_op_wq);
-error_op_wq:
- destroy_workqueue(fscache_object_wq);
-error_object_wq:
+ destroy_workqueue(fscache_wq);
+error_wq:
return ret;
}
@@ -216,14 +112,9 @@ static void __exit fscache_exit(void)
{
_enter("");
- kobject_put(fscache_root);
kmem_cache_destroy(fscache_cookie_jar);
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(fscache_sysctl_header);
-#endif
fscache_proc_cleanup();
- destroy_workqueue(fscache_op_wq);
- destroy_workqueue(fscache_object_wq);
+ destroy_workqueue(fscache_wq);
pr_notice("Unloaded\n");
}
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
deleted file mode 100644
index d6bdb7b5e723..000000000000
--- a/fs/fscache/netfs.c
+++ /dev/null
@@ -1,74 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* FS-Cache netfs (client) registration
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#define FSCACHE_DEBUG_LEVEL COOKIE
-#include <linux/module.h>
-#include <linux/slab.h>
-#include "internal.h"
-
-/*
- * register a network filesystem for caching
- */
-int __fscache_register_netfs(struct fscache_netfs *netfs)
-{
- struct fscache_cookie *candidate, *cookie;
-
- _enter("{%s}", netfs->name);
-
- /* allocate a cookie for the primary index */
- candidate = fscache_alloc_cookie(&fscache_fsdef_index,
- &fscache_fsdef_netfs_def,
- netfs->name, strlen(netfs->name),
- &netfs->version, sizeof(netfs->version),
- netfs, 0);
- if (!candidate) {
- _leave(" = -ENOMEM");
- return -ENOMEM;
- }
-
- candidate->flags = 1 << FSCACHE_COOKIE_ENABLED;
-
- /* check the netfs type is not already present */
- cookie = fscache_hash_cookie(candidate);
- if (!cookie)
- goto already_registered;
- if (cookie != candidate) {
- trace_fscache_cookie(candidate->debug_id, 1, fscache_cookie_discard);
- fscache_free_cookie(candidate);
- }
-
- fscache_cookie_get(cookie->parent, fscache_cookie_get_register_netfs);
- atomic_inc(&cookie->parent->n_children);
-
- netfs->primary_index = cookie;
-
- pr_notice("Netfs '%s' registered for caching\n", netfs->name);
- trace_fscache_netfs(netfs);
- _leave(" = 0");
- return 0;
-
-already_registered:
- fscache_cookie_put(candidate, fscache_cookie_put_dup_netfs);
- _leave(" = -EEXIST");
- return -EEXIST;
-}
-EXPORT_SYMBOL(__fscache_register_netfs);
-
-/*
- * unregister a network filesystem from the cache
- * - all cookies must have been released first
- */
-void __fscache_unregister_netfs(struct fscache_netfs *netfs)
-{
- _enter("{%s.%u}", netfs->name, netfs->version);
-
- fscache_relinquish_cookie(netfs->primary_index, NULL, false);
- pr_notice("Netfs '%s' unregistered from caching\n", netfs->name);
-
- _leave("");
-}
-EXPORT_SYMBOL(__fscache_unregister_netfs);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
deleted file mode 100644
index 6a675652129b..000000000000
--- a/fs/fscache/object.c
+++ /dev/null
@@ -1,1125 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* FS-Cache object state machine handler
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * See Documentation/filesystems/caching/object.rst for a description of the
- * object state machine and the in-kernel representations.
- */
-
-#define FSCACHE_DEBUG_LEVEL COOKIE
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/prefetch.h>
-#include "internal.h"
-
-static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
-static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
-static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
-static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
-static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
-static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
-static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
-static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
-static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
-static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
-static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
-static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
-static const struct fscache_state *fscache_object_dead(struct fscache_object *, int);
-
-#define __STATE_NAME(n) fscache_osm_##n
-#define STATE(n) (&__STATE_NAME(n))
-
-/*
- * Define a work state. Work states are execution states. No event processing
- * is performed by them. The function attached to a work state returns a
- * pointer indicating the next state to which the state machine should
- * transition. Returning NO_TRANSIT repeats the current state, but goes back
- * to the scheduler first.
- */
-#define WORK_STATE(n, sn, f) \
- const struct fscache_state __STATE_NAME(n) = { \
- .name = #n, \
- .short_name = sn, \
- .work = f \
- }
-
-/*
- * Returns from work states.
- */
-#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
-
-#define NO_TRANSIT ((struct fscache_state *)NULL)
-
-/*
- * Define a wait state. Wait states are event processing states. No execution
- * is performed by them. Wait states are just tables of "if event X occurs,
- * clear it and transition to state Y". The dispatcher returns to the
- * scheduler if none of the events in which the wait state has an interest are
- * currently pending.
- */
-#define WAIT_STATE(n, sn, ...) \
- const struct fscache_state __STATE_NAME(n) = { \
- .name = #n, \
- .short_name = sn, \
- .work = NULL, \
- .transitions = { __VA_ARGS__, { 0, NULL } } \
- }
-
-#define TRANSIT_TO(state, emask) \
- { .events = (emask), .transit_to = STATE(state) }
-
-/*
- * The object state machine.
- */
-static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object);
-static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready);
-static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation);
-static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object);
-static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available);
-static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents);
-
-static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object);
-static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object);
-
-static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure);
-static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object);
-static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents);
-static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object);
-static WORK_STATE(OBJECT_DEAD, "DEAD", fscache_object_dead);
-
-static WAIT_STATE(WAIT_FOR_INIT, "?INI",
- TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
-
-static WAIT_STATE(WAIT_FOR_PARENT, "?PRN",
- TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY));
-
-static WAIT_STATE(WAIT_FOR_CMD, "?CMD",
- TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE),
- TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE),
- TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
-
-static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR",
- TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED));
-
-/*
- * Out-of-band event transition tables. These are for handling unexpected
- * events, such as an I/O error. If an OOB event occurs, the state machine
- * clears and disables the event and forces a transition to the nominated work
- * state (acurrently executing work states will complete first).
- *
- * In such a situation, object->state remembers the state the machine should
- * have been in/gone to and returning NO_TRANSIT returns to that.
- */
-static const struct fscache_transition fscache_osm_init_oob[] = {
- TRANSIT_TO(ABORT_INIT,
- (1 << FSCACHE_OBJECT_EV_ERROR) |
- (1 << FSCACHE_OBJECT_EV_KILL)),
- { 0, NULL }
-};
-
-static const struct fscache_transition fscache_osm_lookup_oob[] = {
- TRANSIT_TO(LOOKUP_FAILURE,
- (1 << FSCACHE_OBJECT_EV_ERROR) |
- (1 << FSCACHE_OBJECT_EV_KILL)),
- { 0, NULL }
-};
-
-static const struct fscache_transition fscache_osm_run_oob[] = {
- TRANSIT_TO(KILL_OBJECT,
- (1 << FSCACHE_OBJECT_EV_ERROR) |
- (1 << FSCACHE_OBJECT_EV_KILL)),
- { 0, NULL }
-};
-
-static int fscache_get_object(struct fscache_object *,
- enum fscache_obj_ref_trace);
-static void fscache_put_object(struct fscache_object *,
- enum fscache_obj_ref_trace);
-static bool fscache_enqueue_dependents(struct fscache_object *, int);
-static void fscache_dequeue_object(struct fscache_object *);
-static void fscache_update_aux_data(struct fscache_object *);
-
-/*
- * we need to notify the parent when an op completes that we had outstanding
- * upon it
- */
-static inline void fscache_done_parent_op(struct fscache_object *object)
-{
- struct fscache_object *parent = object->parent;
-
- _enter("OBJ%x {OBJ%x,%x}",
- object->debug_id, parent->debug_id, parent->n_ops);
-
- spin_lock_nested(&parent->lock, 1);
- parent->n_obj_ops--;
- parent->n_ops--;
- if (parent->n_ops == 0)
- fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
- spin_unlock(&parent->lock);
-}
-
-/*
- * Object state machine dispatcher.
- */
-static void fscache_object_sm_dispatcher(struct fscache_object *object)
-{
- const struct fscache_transition *t;
- const struct fscache_state *state, *new_state;
- unsigned long events, event_mask;
- bool oob;
- int event = -1;
-
- ASSERT(object != NULL);
-
- _enter("{OBJ%x,%s,%lx}",
- object->debug_id, object->state->name, object->events);
-
- event_mask = object->event_mask;
-restart:
- object->event_mask = 0; /* Mask normal event handling */
- state = object->state;
-restart_masked:
- events = object->events;
-
- /* Handle any out-of-band events (typically an error) */
- if (events & object->oob_event_mask) {
- _debug("{OBJ%x} oob %lx",
- object->debug_id, events & object->oob_event_mask);
- oob = true;
- for (t = object->oob_table; t->events; t++) {
- if (events & t->events) {
- state = t->transit_to;
- ASSERT(state->work != NULL);
- event = fls(events & t->events) - 1;
- __clear_bit(event, &object->oob_event_mask);
- clear_bit(event, &object->events);
- goto execute_work_state;
- }
- }
- }
- oob = false;
-
- /* Wait states are just transition tables */
- if (!state->work) {
- if (events & event_mask) {
- for (t = state->transitions; t->events; t++) {
- if (events & t->events) {
- new_state = t->transit_to;
- event = fls(events & t->events) - 1;
- trace_fscache_osm(object, state,
- true, false, event);
- clear_bit(event, &object->events);
- _debug("{OBJ%x} ev %d: %s -> %s",
- object->debug_id, event,
- state->name, new_state->name);
- object->state = state = new_state;
- goto execute_work_state;
- }
- }
-
- /* The event mask didn't include all the tabled bits */
- BUG();
- }
- /* Randomly woke up */
- goto unmask_events;
- }
-
-execute_work_state:
- _debug("{OBJ%x} exec %s", object->debug_id, state->name);
-
- trace_fscache_osm(object, state, false, oob, event);
- new_state = state->work(object, event);
- event = -1;
- if (new_state == NO_TRANSIT) {
- _debug("{OBJ%x} %s notrans", object->debug_id, state->name);
- if (unlikely(state == STATE(OBJECT_DEAD))) {
- _leave(" [dead]");
- return;
- }
- fscache_enqueue_object(object);
- event_mask = object->oob_event_mask;
- goto unmask_events;
- }
-
- _debug("{OBJ%x} %s -> %s",
- object->debug_id, state->name, new_state->name);
- object->state = state = new_state;
-
- if (state->work) {
- if (unlikely(state == STATE(OBJECT_DEAD))) {
- _leave(" [dead]");
- return;
- }
- goto restart_masked;
- }
-
- /* Transited to wait state */
- event_mask = object->oob_event_mask;
- for (t = state->transitions; t->events; t++)
- event_mask |= t->events;
-
-unmask_events:
- object->event_mask = event_mask;
- smp_mb();
- events = object->events;
- if (events & event_mask)
- goto restart;
- _leave(" [msk %lx]", event_mask);
-}
-
-/*
- * execute an object
- */
-static void fscache_object_work_func(struct work_struct *work)
-{
- struct fscache_object *object =
- container_of(work, struct fscache_object, work);
-
- _enter("{OBJ%x}", object->debug_id);
-
- fscache_object_sm_dispatcher(object);
- fscache_put_object(object, fscache_obj_put_work);
-}
-
-/**
- * fscache_object_init - Initialise a cache object description
- * @object: Object description
- * @cookie: Cookie object will be attached to
- * @cache: Cache in which backing object will be found
- *
- * Initialise a cache object description to its basic values.
- *
- * See Documentation/filesystems/caching/backend-api.rst for a complete
- * description.
- */
-void fscache_object_init(struct fscache_object *object,
- struct fscache_cookie *cookie,
- struct fscache_cache *cache)
-{
- const struct fscache_transition *t;
-
- atomic_inc(&cache->object_count);
-
- object->state = STATE(WAIT_FOR_INIT);
- object->oob_table = fscache_osm_init_oob;
- object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
- spin_lock_init(&object->lock);
- INIT_LIST_HEAD(&object->cache_link);
- INIT_HLIST_NODE(&object->cookie_link);
- INIT_WORK(&object->work, fscache_object_work_func);
- INIT_LIST_HEAD(&object->dependents);
- INIT_LIST_HEAD(&object->dep_link);
- INIT_LIST_HEAD(&object->pending_ops);
- object->n_children = 0;
- object->n_ops = object->n_in_progress = object->n_exclusive = 0;
- object->events = 0;
- object->store_limit = 0;
- object->store_limit_l = 0;
- object->cache = cache;
- object->cookie = cookie;
- fscache_cookie_get(cookie, fscache_cookie_get_attach_object);
- object->parent = NULL;
-#ifdef CONFIG_FSCACHE_OBJECT_LIST
- RB_CLEAR_NODE(&object->objlist_link);
-#endif
-
- object->oob_event_mask = 0;
- for (t = object->oob_table; t->events; t++)
- object->oob_event_mask |= t->events;
- object->event_mask = object->oob_event_mask;
- for (t = object->state->transitions; t->events; t++)
- object->event_mask |= t->events;
-}
-EXPORT_SYMBOL(fscache_object_init);
-
-/*
- * Mark the object as no longer being live, making sure that we synchronise
- * against op submission.
- */
-static inline void fscache_mark_object_dead(struct fscache_object *object)
-{
- spin_lock(&object->lock);
- clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
- spin_unlock(&object->lock);
-}
-
-/*
- * Abort object initialisation before we start it.
- */
-static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
- int event)
-{
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- object->oob_event_mask = 0;
- fscache_dequeue_object(object);
- return transit_to(KILL_OBJECT);
-}
-
-/*
- * initialise an object
- * - check the specified object's parent to see if we can make use of it
- * immediately to do a creation
- * - we may need to start the process of creating a parent and we need to wait
- * for the parent's lookup and creation to complete if it's not there yet
- */
-static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
- int event)
-{
- struct fscache_object *parent;
- bool success;
-
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- ASSERT(list_empty(&object->dep_link));
-
- parent = object->parent;
- if (!parent) {
- _leave(" [no parent]");
- return transit_to(DROP_OBJECT);
- }
-
- _debug("parent: %s of:%lx", parent->state->name, parent->flags);
-
- if (fscache_object_is_dying(parent)) {
- _leave(" [bad parent]");
- return transit_to(DROP_OBJECT);
- }
-
- if (fscache_object_is_available(parent)) {
- _leave(" [ready]");
- return transit_to(PARENT_READY);
- }
-
- _debug("wait");
-
- spin_lock(&parent->lock);
- fscache_stat(&fscache_n_cop_grab_object);
- success = false;
- if (fscache_object_is_live(parent) &&
- object->cache->ops->grab_object(object, fscache_obj_get_add_to_deps)) {
- list_add(&object->dep_link, &parent->dependents);
- success = true;
- }
- fscache_stat_d(&fscache_n_cop_grab_object);
- spin_unlock(&parent->lock);
- if (!success) {
- _leave(" [grab failed]");
- return transit_to(DROP_OBJECT);
- }
-
- /* fscache_acquire_non_index_cookie() uses this
- * to wake the chain up */
- fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
- _leave(" [wait]");
- return transit_to(WAIT_FOR_PARENT);
-}
-
-/*
- * Once the parent object is ready, we should kick off our lookup op.
- */
-static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
- int event)
-{
- struct fscache_object *parent = object->parent;
-
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- ASSERT(parent != NULL);
-
- spin_lock(&parent->lock);
- parent->n_ops++;
- parent->n_obj_ops++;
- spin_unlock(&parent->lock);
-
- _leave("");
- return transit_to(LOOK_UP_OBJECT);
-}
-
-/*
- * look an object up in the cache from which it was allocated
- * - we hold an "access lock" on the parent object, so the parent object cannot
- * be withdrawn by either party till we've finished
- */
-static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
- int event)
-{
- struct fscache_cookie *cookie = object->cookie;
- struct fscache_object *parent = object->parent;
- int ret;
-
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- object->oob_table = fscache_osm_lookup_oob;
-
- ASSERT(parent != NULL);
- ASSERTCMP(parent->n_ops, >, 0);
- ASSERTCMP(parent->n_obj_ops, >, 0);
-
- /* make sure the parent is still available */
- ASSERT(fscache_object_is_available(parent));
-
- if (fscache_object_is_dying(parent) ||
- test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
- !fscache_use_cookie(object)) {
- _leave(" [unavailable]");
- return transit_to(LOOKUP_FAILURE);
- }
-
- _debug("LOOKUP \"%s\" in \"%s\"",
- cookie->def->name, object->cache->tag->name);
-
- fscache_stat(&fscache_n_object_lookups);
- fscache_stat(&fscache_n_cop_lookup_object);
- ret = object->cache->ops->lookup_object(object);
- fscache_stat_d(&fscache_n_cop_lookup_object);
-
- fscache_unuse_cookie(object);
-
- if (ret == -ETIMEDOUT) {
- /* probably stuck behind another object, so move this one to
- * the back of the queue */
- fscache_stat(&fscache_n_object_lookups_timed_out);
- _leave(" [timeout]");
- return NO_TRANSIT;
- }
-
- if (ret < 0) {
- _leave(" [error]");
- return transit_to(LOOKUP_FAILURE);
- }
-
- _leave(" [ok]");
- return transit_to(OBJECT_AVAILABLE);
-}
-
-/**
- * fscache_object_lookup_negative - Note negative cookie lookup
- * @object: Object pointing to cookie to mark
- *
- * Note negative lookup, permitting those waiting to read data from an already
- * existing backing object to continue as there's no data for them to read.
- */
-void fscache_object_lookup_negative(struct fscache_object *object)
-{
- struct fscache_cookie *cookie = object->cookie;
-
- _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
-
- if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
- fscache_stat(&fscache_n_object_lookups_negative);
-
- /* Allow write requests to begin stacking up and read requests to begin
- * returning ENODATA.
- */
- set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
- clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
-
- clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- }
- _leave("");
-}
-EXPORT_SYMBOL(fscache_object_lookup_negative);
-
-/**
- * fscache_obtained_object - Note successful object lookup or creation
- * @object: Object pointing to cookie to mark
- *
- * Note successful lookup and/or creation, permitting those waiting to write
- * data to a backing object to continue.
- *
- * Note that after calling this, an object's cookie may be relinquished by the
- * netfs, and so must be accessed with object lock held.
- */
-void fscache_obtained_object(struct fscache_object *object)
-{
- struct fscache_cookie *cookie = object->cookie;
-
- _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
-
- /* if we were still looking up, then we must have a positive lookup
- * result, in which case there may be data available */
- if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
- fscache_stat(&fscache_n_object_lookups_positive);
-
- /* We do (presumably) have data */
- clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
- clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
-
- /* Allow write requests to begin stacking up and read requests
- * to begin shovelling data.
- */
- clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- } else {
- fscache_stat(&fscache_n_object_created);
- }
-
- set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
- _leave("");
-}
-EXPORT_SYMBOL(fscache_obtained_object);
-
-/*
- * handle an object that has just become available
- */
-static const struct fscache_state *fscache_object_available(struct fscache_object *object,
- int event)
-{
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- object->oob_table = fscache_osm_run_oob;
-
- spin_lock(&object->lock);
-
- fscache_done_parent_op(object);
- if (object->n_in_progress == 0) {
- if (object->n_ops > 0) {
- ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
- fscache_start_operations(object);
- } else {
- ASSERT(list_empty(&object->pending_ops));
- }
- }
- spin_unlock(&object->lock);
-
- fscache_stat(&fscache_n_cop_lookup_complete);
- object->cache->ops->lookup_complete(object);
- fscache_stat_d(&fscache_n_cop_lookup_complete);
-
- fscache_stat(&fscache_n_object_avail);
-
- _leave("");
- return transit_to(JUMPSTART_DEPS);
-}
-
-/*
- * Wake up this object's dependent objects now that we've become available.
- */
-static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
- int event)
-{
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
- return NO_TRANSIT; /* Not finished; requeue */
- return transit_to(WAIT_FOR_CMD);
-}
-
-/*
- * Handle lookup or creation failute.
- */
-static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
- int event)
-{
- struct fscache_cookie *cookie;
-
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- object->oob_event_mask = 0;
-
- fscache_stat(&fscache_n_cop_lookup_complete);
- object->cache->ops->lookup_complete(object);
- fscache_stat_d(&fscache_n_cop_lookup_complete);
-
- set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags);
-
- cookie = object->cookie;
- set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
- if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-
- fscache_done_parent_op(object);
- return transit_to(KILL_OBJECT);
-}
-
-/*
- * Wait for completion of all active operations on this object and the death of
- * all child objects of this object.
- */
-static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
- int event)
-{
- _enter("{OBJ%x,%d,%d},%d",
- object->debug_id, object->n_ops, object->n_children, event);
-
- fscache_mark_object_dead(object);
- object->oob_event_mask = 0;
-
- if (test_bit(FSCACHE_OBJECT_RETIRED, &object->flags)) {
- /* Reject any new read/write ops and abort any that are pending. */
- clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
- fscache_cancel_all_ops(object);
- }
-
- if (list_empty(&object->dependents) &&
- object->n_ops == 0 &&
- object->n_children == 0)
- return transit_to(DROP_OBJECT);
-
- if (object->n_in_progress == 0) {
- spin_lock(&object->lock);
- if (object->n_ops > 0 && object->n_in_progress == 0)
- fscache_start_operations(object);
- spin_unlock(&object->lock);
- }
-
- if (!list_empty(&object->dependents))
- return transit_to(KILL_DEPENDENTS);
-
- return transit_to(WAIT_FOR_CLEARANCE);
-}
-
-/*
- * Kill dependent objects.
- */
-static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
- int event)
-{
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
- return NO_TRANSIT; /* Not finished */
- return transit_to(WAIT_FOR_CLEARANCE);
-}
-
-/*
- * Drop an object's attachments
- */
-static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
- int event)
-{
- struct fscache_object *parent = object->parent;
- struct fscache_cookie *cookie = object->cookie;
- struct fscache_cache *cache = object->cache;
- bool awaken = false;
-
- _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
-
- ASSERT(cookie != NULL);
- ASSERT(!hlist_unhashed(&object->cookie_link));
-
- if (test_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags)) {
- _debug("final update");
- fscache_update_aux_data(object);
- }
-
- /* Make sure the cookie no longer points here and that the netfs isn't
- * waiting for us.
- */
- spin_lock(&cookie->lock);
- hlist_del_init(&object->cookie_link);
- if (hlist_empty(&cookie->backing_objects) &&
- test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
- awaken = true;
- spin_unlock(&cookie->lock);
-
- if (awaken)
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
- if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-
-
- /* Prevent a race with our last child, which has to signal EV_CLEARED
- * before dropping our spinlock.
- */
- spin_lock(&object->lock);
- spin_unlock(&object->lock);
-
- /* Discard from the cache's collection of objects */
- spin_lock(&cache->object_list_lock);
- list_del_init(&object->cache_link);
- spin_unlock(&cache->object_list_lock);
-
- fscache_stat(&fscache_n_cop_drop_object);
- cache->ops->drop_object(object);
- fscache_stat_d(&fscache_n_cop_drop_object);
-
- /* The parent object wants to know when all it dependents have gone */
- if (parent) {
- _debug("release parent OBJ%x {%d}",
- parent->debug_id, parent->n_children);
-
- spin_lock(&parent->lock);
- parent->n_children--;
- if (parent->n_children == 0)
- fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
- spin_unlock(&parent->lock);
- object->parent = NULL;
- }
-
- /* this just shifts the object release to the work processor */
- fscache_put_object(object, fscache_obj_put_drop_obj);
- fscache_stat(&fscache_n_object_dead);
-
- _leave("");
- return transit_to(OBJECT_DEAD);
-}
-
-/*
- * get a ref on an object
- */
-static int fscache_get_object(struct fscache_object *object,
- enum fscache_obj_ref_trace why)
-{
- int ret;
-
- fscache_stat(&fscache_n_cop_grab_object);
- ret = object->cache->ops->grab_object(object, why) ? 0 : -EAGAIN;
- fscache_stat_d(&fscache_n_cop_grab_object);
- return ret;
-}
-
-/*
- * Discard a ref on an object
- */
-static void fscache_put_object(struct fscache_object *object,
- enum fscache_obj_ref_trace why)
-{
- fscache_stat(&fscache_n_cop_put_object);
- object->cache->ops->put_object(object, why);
- fscache_stat_d(&fscache_n_cop_put_object);
-}
-
-/**
- * fscache_object_destroy - Note that a cache object is about to be destroyed
- * @object: The object to be destroyed
- *
- * Note the imminent destruction and deallocation of a cache object record.
- */
-void fscache_object_destroy(struct fscache_object *object)
-{
- /* We can get rid of the cookie now */
- fscache_cookie_put(object->cookie, fscache_cookie_put_object);
- object->cookie = NULL;
-}
-EXPORT_SYMBOL(fscache_object_destroy);
-
-/*
- * enqueue an object for metadata-type processing
- */
-void fscache_enqueue_object(struct fscache_object *object)
-{
- _enter("{OBJ%x}", object->debug_id);
-
- if (fscache_get_object(object, fscache_obj_get_queue) >= 0) {
- wait_queue_head_t *cong_wq =
- &get_cpu_var(fscache_object_cong_wait);
-
- if (queue_work(fscache_object_wq, &object->work)) {
- if (fscache_object_congested())
- wake_up(cong_wq);
- } else
- fscache_put_object(object, fscache_obj_put_queue);
-
- put_cpu_var(fscache_object_cong_wait);
- }
-}
-
-/**
- * fscache_object_sleep_till_congested - Sleep until object wq is congested
- * @timeoutp: Scheduler sleep timeout
- *
- * Allow an object handler to sleep until the object workqueue is congested.
- *
- * The caller must set up a wake up event before calling this and must have set
- * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
- * condition before calling this function as no test is made here.
- *
- * %true is returned if the object wq is congested, %false otherwise.
- */
-bool fscache_object_sleep_till_congested(signed long *timeoutp)
-{
- wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait);
- DEFINE_WAIT(wait);
-
- if (fscache_object_congested())
- return true;
-
- add_wait_queue_exclusive(cong_wq, &wait);
- if (!fscache_object_congested())
- *timeoutp = schedule_timeout(*timeoutp);
- finish_wait(cong_wq, &wait);
-
- return fscache_object_congested();
-}
-EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
-
-/*
- * Enqueue the dependents of an object for metadata-type processing.
- *
- * If we don't manage to finish the list before the scheduler wants to run
- * again then return false immediately. We return true if the list was
- * cleared.
- */
-static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
-{
- struct fscache_object *dep;
- bool ret = true;
-
- _enter("{OBJ%x}", object->debug_id);
-
- if (list_empty(&object->dependents))
- return true;
-
- spin_lock(&object->lock);
-
- while (!list_empty(&object->dependents)) {
- dep = list_entry(object->dependents.next,
- struct fscache_object, dep_link);
- list_del_init(&dep->dep_link);
-
- fscache_raise_event(dep, event);
- fscache_put_object(dep, fscache_obj_put_enq_dep);
-
- if (!list_empty(&object->dependents) && need_resched()) {
- ret = false;
- break;
- }
- }
-
- spin_unlock(&object->lock);
- return ret;
-}
-
-/*
- * remove an object from whatever queue it's waiting on
- */
-static void fscache_dequeue_object(struct fscache_object *object)
-{
- _enter("{OBJ%x}", object->debug_id);
-
- if (!list_empty(&object->dep_link)) {
- spin_lock(&object->parent->lock);
- list_del_init(&object->dep_link);
- spin_unlock(&object->parent->lock);
- }
-
- _leave("");
-}
-
-/**
- * fscache_check_aux - Ask the netfs whether an object on disk is still valid
- * @object: The object to ask about
- * @data: The auxiliary data for the object
- * @datalen: The size of the auxiliary data
- * @object_size: The size of the object according to the server.
- *
- * This function consults the netfs about the coherency state of an object.
- * The caller must be holding a ref on cookie->n_active (held by
- * fscache_look_up_object() on behalf of the cache backend during object lookup
- * and creation).
- */
-enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
- const void *data, uint16_t datalen,
- loff_t object_size)
-{
- enum fscache_checkaux result;
-
- if (!object->cookie->def->check_aux) {
- fscache_stat(&fscache_n_checkaux_none);
- return FSCACHE_CHECKAUX_OKAY;
- }
-
- result = object->cookie->def->check_aux(object->cookie->netfs_data,
- data, datalen, object_size);
- switch (result) {
- /* entry okay as is */
- case FSCACHE_CHECKAUX_OKAY:
- fscache_stat(&fscache_n_checkaux_okay);
- break;
-
- /* entry requires update */
- case FSCACHE_CHECKAUX_NEEDS_UPDATE:
- fscache_stat(&fscache_n_checkaux_update);
- break;
-
- /* entry requires deletion */
- case FSCACHE_CHECKAUX_OBSOLETE:
- fscache_stat(&fscache_n_checkaux_obsolete);
- break;
-
- default:
- BUG();
- }
-
- return result;
-}
-EXPORT_SYMBOL(fscache_check_aux);
-
-/*
- * Asynchronously invalidate an object.
- */
-static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
- int event)
-{
- struct fscache_operation *op;
- struct fscache_cookie *cookie = object->cookie;
-
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- /* We're going to need the cookie. If the cookie is not available then
- * retire the object instead.
- */
- if (!fscache_use_cookie(object)) {
- ASSERT(radix_tree_empty(&object->cookie->stores));
- set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
- _leave(" [no cookie]");
- return transit_to(KILL_OBJECT);
- }
-
- /* Reject any new read/write ops and abort any that are pending. */
- fscache_invalidate_writes(cookie);
- clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
- fscache_cancel_all_ops(object);
-
- /* Now we have to wait for in-progress reads and writes */
- op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op)
- goto nomem;
-
- fscache_operation_init(cookie, op, object->cache->ops->invalidate_object,
- NULL, NULL);
- op->flags = FSCACHE_OP_ASYNC |
- (1 << FSCACHE_OP_EXCLUSIVE) |
- (1 << FSCACHE_OP_UNUSE_COOKIE);
- trace_fscache_page_op(cookie, NULL, op, fscache_page_op_invalidate);
-
- spin_lock(&cookie->lock);
- if (fscache_submit_exclusive_op(object, op) < 0)
- goto submit_op_failed;
- spin_unlock(&cookie->lock);
- fscache_put_operation(op);
-
- /* Once we've completed the invalidation, we know there will be no data
- * stored in the cache and thus we can reinstate the data-check-skip
- * optimisation.
- */
- set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
-
- /* We can allow read and write requests to come in once again. They'll
- * queue up behind our exclusive invalidation operation.
- */
- if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
- _leave(" [ok]");
- return transit_to(UPDATE_OBJECT);
-
-nomem:
- fscache_mark_object_dead(object);
- fscache_unuse_cookie(object);
- _leave(" [ENOMEM]");
- return transit_to(KILL_OBJECT);
-
-submit_op_failed:
- fscache_mark_object_dead(object);
- spin_unlock(&cookie->lock);
- fscache_unuse_cookie(object);
- kfree(op);
- _leave(" [EIO]");
- return transit_to(KILL_OBJECT);
-}
-
-static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
- int event)
-{
- const struct fscache_state *s;
-
- fscache_stat(&fscache_n_invalidates_run);
- fscache_stat(&fscache_n_cop_invalidate_object);
- s = _fscache_invalidate_object(object, event);
- fscache_stat_d(&fscache_n_cop_invalidate_object);
- return s;
-}
-
-/*
- * Update auxiliary data.
- */
-static void fscache_update_aux_data(struct fscache_object *object)
-{
- fscache_stat(&fscache_n_updates_run);
- fscache_stat(&fscache_n_cop_update_object);
- object->cache->ops->update_object(object);
- fscache_stat_d(&fscache_n_cop_update_object);
-}
-
-/*
- * Asynchronously update an object.
- */
-static const struct fscache_state *fscache_update_object(struct fscache_object *object,
- int event)
-{
- _enter("{OBJ%x},%d", object->debug_id, event);
-
- fscache_update_aux_data(object);
-
- _leave("");
- return transit_to(WAIT_FOR_CMD);
-}
-
-/**
- * fscache_object_retrying_stale - Note retrying stale object
- * @object: The object that will be retried
- *
- * Note that an object lookup found an on-disk object that was adjudged to be
- * stale and has been deleted. The lookup will be retried.
- */
-void fscache_object_retrying_stale(struct fscache_object *object)
-{
- fscache_stat(&fscache_n_cache_no_space_reject);
-}
-EXPORT_SYMBOL(fscache_object_retrying_stale);
-
-/**
- * fscache_object_mark_killed - Note that an object was killed
- * @object: The object that was culled
- * @why: The reason the object was killed.
- *
- * Note that an object was killed. Returns true if the object was
- * already marked killed, false if it wasn't.
- */
-void fscache_object_mark_killed(struct fscache_object *object,
- enum fscache_why_object_killed why)
-{
- if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) {
- pr_err("Error: Object already killed by cache [%s]\n",
- object->cache->identifier);
- return;
- }
-
- switch (why) {
- case FSCACHE_OBJECT_NO_SPACE:
- fscache_stat(&fscache_n_cache_no_space_reject);
- break;
- case FSCACHE_OBJECT_IS_STALE:
- fscache_stat(&fscache_n_cache_stale_objects);
- break;
- case FSCACHE_OBJECT_WAS_RETIRED:
- fscache_stat(&fscache_n_cache_retired_objects);
- break;
- case FSCACHE_OBJECT_WAS_CULLED:
- fscache_stat(&fscache_n_cache_culled_objects);
- break;
- }
-}
-EXPORT_SYMBOL(fscache_object_mark_killed);
-
-/*
- * The object is dead. We can get here if an object gets queued by an event
- * that would lead to its death (such as EV_KILL) when the dispatcher is
- * already running (and so can be requeued) but hasn't yet cleared the event
- * mask.
- */
-static const struct fscache_state *fscache_object_dead(struct fscache_object *object,
- int event)
-{
- if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD,
- &object->flags))
- return NO_TRANSIT;
-
- WARN(true, "FS-Cache object redispatched after death");
- return NO_TRANSIT;
-}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
deleted file mode 100644
index e002cdfaf3cc..000000000000
--- a/fs/fscache/operation.c
+++ /dev/null
@@ -1,633 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* FS-Cache worker operation management routines
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * See Documentation/filesystems/caching/operations.rst
- */
-
-#define FSCACHE_DEBUG_LEVEL OPERATION
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include "internal.h"
-
-atomic_t fscache_op_debug_id;
-EXPORT_SYMBOL(fscache_op_debug_id);
-
-static void fscache_operation_dummy_cancel(struct fscache_operation *op)
-{
-}
-
-/**
- * fscache_operation_init - Do basic initialisation of an operation
- * @cookie: The cookie to operate on
- * @op: The operation to initialise
- * @processor: The function to perform the operation
- * @cancel: A function to handle operation cancellation
- * @release: The release function to assign
- *
- * Do basic initialisation of an operation. The caller must still set flags,
- * object and processor if needed.
- */
-void fscache_operation_init(struct fscache_cookie *cookie,
- struct fscache_operation *op,
- fscache_operation_processor_t processor,
- fscache_operation_cancel_t cancel,
- fscache_operation_release_t release)
-{
- INIT_WORK(&op->work, fscache_op_work_func);
- atomic_set(&op->usage, 1);
- op->state = FSCACHE_OP_ST_INITIALISED;
- op->debug_id = atomic_inc_return(&fscache_op_debug_id);
- op->processor = processor;
- op->cancel = cancel ?: fscache_operation_dummy_cancel;
- op->release = release;
- INIT_LIST_HEAD(&op->pend_link);
- fscache_stat(&fscache_n_op_initialised);
- trace_fscache_op(cookie, op, fscache_op_init);
-}
-EXPORT_SYMBOL(fscache_operation_init);
-
-/**
- * fscache_enqueue_operation - Enqueue an operation for processing
- * @op: The operation to enqueue
- *
- * Enqueue an operation for processing by the FS-Cache thread pool.
- *
- * This will get its own ref on the object.
- */
-void fscache_enqueue_operation(struct fscache_operation *op)
-{
- struct fscache_cookie *cookie = op->object->cookie;
-
- _enter("{OBJ%x OP%x,%u}",
- op->object->debug_id, op->debug_id, atomic_read(&op->usage));
-
- ASSERT(list_empty(&op->pend_link));
- ASSERT(op->processor != NULL);
- ASSERT(fscache_object_is_available(op->object));
- ASSERTCMP(atomic_read(&op->usage), >, 0);
- ASSERTIFCMP(op->state != FSCACHE_OP_ST_IN_PROGRESS,
- op->state, ==, FSCACHE_OP_ST_CANCELLED);
-
- fscache_stat(&fscache_n_op_enqueue);
- switch (op->flags & FSCACHE_OP_TYPE) {
- case FSCACHE_OP_ASYNC:
- trace_fscache_op(cookie, op, fscache_op_enqueue_async);
- _debug("queue async");
- atomic_inc(&op->usage);
- if (!queue_work(fscache_op_wq, &op->work))
- fscache_put_operation(op);
- break;
- case FSCACHE_OP_MYTHREAD:
- trace_fscache_op(cookie, op, fscache_op_enqueue_mythread);
- _debug("queue for caller's attention");
- break;
- default:
- pr_err("Unexpected op type %lx", op->flags);
- BUG();
- break;
- }
-}
-EXPORT_SYMBOL(fscache_enqueue_operation);
-
-/*
- * start an op running
- */
-static void fscache_run_op(struct fscache_object *object,
- struct fscache_operation *op)
-{
- ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
-
- op->state = FSCACHE_OP_ST_IN_PROGRESS;
- object->n_in_progress++;
- if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
- wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
- if (op->processor)
- fscache_enqueue_operation(op);
- else
- trace_fscache_op(object->cookie, op, fscache_op_run);
- fscache_stat(&fscache_n_op_run);
-}
-
-/*
- * report an unexpected submission
- */
-static void fscache_report_unexpected_submission(struct fscache_object *object,
- struct fscache_operation *op,
- const struct fscache_state *ostate)
-{
- static bool once_only;
- struct fscache_operation *p;
- unsigned n;
-
- if (once_only)
- return;
- once_only = true;
-
- kdebug("unexpected submission OP%x [OBJ%x %s]",
- op->debug_id, object->debug_id, object->state->name);
- kdebug("objstate=%s [%s]", object->state->name, ostate->name);
- kdebug("objflags=%lx", object->flags);
- kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
- kdebug("ops=%u inp=%u exc=%u",
- object->n_ops, object->n_in_progress, object->n_exclusive);
-
- if (!list_empty(&object->pending_ops)) {
- n = 0;
- list_for_each_entry(p, &object->pending_ops, pend_link) {
- ASSERTCMP(p->object, ==, object);
- kdebug("%p %p", op->processor, op->release);
- n++;
- }
-
- kdebug("n=%u", n);
- }
-
- dump_stack();
-}
-
-/*
- * submit an exclusive operation for an object
- * - other ops are excluded from running simultaneously with this one
- * - this gets any extra refs it needs on an op
- */
-int fscache_submit_exclusive_op(struct fscache_object *object,
- struct fscache_operation *op)
-{
- const struct fscache_state *ostate;
- unsigned long flags;
- int ret;
-
- _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
-
- trace_fscache_op(object->cookie, op, fscache_op_submit_ex);
-
- ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
- ASSERTCMP(atomic_read(&op->usage), >, 0);
-
- spin_lock(&object->lock);
- ASSERTCMP(object->n_ops, >=, object->n_in_progress);
- ASSERTCMP(object->n_ops, >=, object->n_exclusive);
- ASSERT(list_empty(&op->pend_link));
-
- ostate = object->state;
- smp_rmb();
-
- op->state = FSCACHE_OP_ST_PENDING;
- flags = READ_ONCE(object->flags);
- if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
- fscache_stat(&fscache_n_op_rejected);
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- ret = -ENOBUFS;
- } else if (unlikely(fscache_cache_is_broken(object))) {
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- ret = -EIO;
- } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
- op->object = object;
- object->n_ops++;
- object->n_exclusive++; /* reads and writes must wait */
-
- if (object->n_in_progress > 0) {
- atomic_inc(&op->usage);
- list_add_tail(&op->pend_link, &object->pending_ops);
- fscache_stat(&fscache_n_op_pend);
- } else if (!list_empty(&object->pending_ops)) {
- atomic_inc(&op->usage);
- list_add_tail(&op->pend_link, &object->pending_ops);
- fscache_stat(&fscache_n_op_pend);
- fscache_start_operations(object);
- } else {
- ASSERTCMP(object->n_in_progress, ==, 0);
- fscache_run_op(object, op);
- }
-
- /* need to issue a new write op after this */
- clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
- ret = 0;
- } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
- op->object = object;
- object->n_ops++;
- object->n_exclusive++; /* reads and writes must wait */
- atomic_inc(&op->usage);
- list_add_tail(&op->pend_link, &object->pending_ops);
- fscache_stat(&fscache_n_op_pend);
- ret = 0;
- } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- ret = -ENOBUFS;
- } else {
- fscache_report_unexpected_submission(object, op, ostate);
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- ret = -ENOBUFS;
- }
-
- spin_unlock(&object->lock);
- return ret;
-}
-
-/*
- * submit an operation for an object
- * - objects may be submitted only in the following states:
- * - during object creation (write ops may be submitted)
- * - whilst the object is active
- * - after an I/O error incurred in one of the two above states (op rejected)
- * - this gets any extra refs it needs on an op
- */
-int fscache_submit_op(struct fscache_object *object,
- struct fscache_operation *op)
-{
- const struct fscache_state *ostate;
- unsigned long flags;
- int ret;
-
- _enter("{OBJ%x OP%x},{%u}",
- object->debug_id, op->debug_id, atomic_read(&op->usage));
-
- trace_fscache_op(object->cookie, op, fscache_op_submit);
-
- ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
- ASSERTCMP(atomic_read(&op->usage), >, 0);
-
- spin_lock(&object->lock);
- ASSERTCMP(object->n_ops, >=, object->n_in_progress);
- ASSERTCMP(object->n_ops, >=, object->n_exclusive);
- ASSERT(list_empty(&op->pend_link));
-
- ostate = object->state;
- smp_rmb();
-
- op->state = FSCACHE_OP_ST_PENDING;
- flags = READ_ONCE(object->flags);
- if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
- fscache_stat(&fscache_n_op_rejected);
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- ret = -ENOBUFS;
- } else if (unlikely(fscache_cache_is_broken(object))) {
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- ret = -EIO;
- } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
- op->object = object;
- object->n_ops++;
-
- if (object->n_exclusive > 0) {
- atomic_inc(&op->usage);
- list_add_tail(&op->pend_link, &object->pending_ops);
- fscache_stat(&fscache_n_op_pend);
- } else if (!list_empty(&object->pending_ops)) {
- atomic_inc(&op->usage);
- list_add_tail(&op->pend_link, &object->pending_ops);
- fscache_stat(&fscache_n_op_pend);
- fscache_start_operations(object);
- } else {
- ASSERTCMP(object->n_exclusive, ==, 0);
- fscache_run_op(object, op);
- }
- ret = 0;
- } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
- op->object = object;
- object->n_ops++;
- atomic_inc(&op->usage);
- list_add_tail(&op->pend_link, &object->pending_ops);
- fscache_stat(&fscache_n_op_pend);
- ret = 0;
- } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- ret = -ENOBUFS;
- } else {
- fscache_report_unexpected_submission(object, op, ostate);
- ASSERT(!fscache_object_is_active(object));
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- ret = -ENOBUFS;
- }
-
- spin_unlock(&object->lock);
- return ret;
-}
-
-/*
- * queue an object for withdrawal on error, aborting all following asynchronous
- * operations
- */
-void fscache_abort_object(struct fscache_object *object)
-{
- _enter("{OBJ%x}", object->debug_id);
-
- fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
-}
-
-/*
- * Jump start the operation processing on an object. The caller must hold
- * object->lock.
- */
-void fscache_start_operations(struct fscache_object *object)
-{
- struct fscache_operation *op;
- bool stop = false;
-
- while (!list_empty(&object->pending_ops) && !stop) {
- op = list_entry(object->pending_ops.next,
- struct fscache_operation, pend_link);
-
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
- if (object->n_in_progress > 0)
- break;
- stop = true;
- }
- list_del_init(&op->pend_link);
- fscache_run_op(object, op);
-
- /* the pending queue was holding a ref on the object */
- fscache_put_operation(op);
- }
-
- ASSERTCMP(object->n_in_progress, <=, object->n_ops);
-
- _debug("woke %d ops on OBJ%x",
- object->n_in_progress, object->debug_id);
-}
-
-/*
- * cancel an operation that's pending on an object
- */
-int fscache_cancel_op(struct fscache_operation *op,
- bool cancel_in_progress_op)
-{
- struct fscache_object *object = op->object;
- bool put = false;
- int ret;
-
- _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
-
- trace_fscache_op(object->cookie, op, fscache_op_cancel);
-
- ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING);
- ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED);
- ASSERTCMP(atomic_read(&op->usage), >, 0);
-
- spin_lock(&object->lock);
-
- ret = -EBUSY;
- if (op->state == FSCACHE_OP_ST_PENDING) {
- ASSERT(!list_empty(&op->pend_link));
- list_del_init(&op->pend_link);
- put = true;
-
- fscache_stat(&fscache_n_op_cancelled);
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
- object->n_exclusive--;
- if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
- wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
- ret = 0;
- } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) {
- ASSERTCMP(object->n_in_progress, >, 0);
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
- object->n_exclusive--;
- object->n_in_progress--;
- if (object->n_in_progress == 0)
- fscache_start_operations(object);
-
- fscache_stat(&fscache_n_op_cancelled);
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
- object->n_exclusive--;
- if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
- wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
- ret = 0;
- }
-
- if (put)
- fscache_put_operation(op);
- spin_unlock(&object->lock);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * Cancel all pending operations on an object
- */
-void fscache_cancel_all_ops(struct fscache_object *object)
-{
- struct fscache_operation *op;
-
- _enter("OBJ%x", object->debug_id);
-
- spin_lock(&object->lock);
-
- while (!list_empty(&object->pending_ops)) {
- op = list_entry(object->pending_ops.next,
- struct fscache_operation, pend_link);
- fscache_stat(&fscache_n_op_cancelled);
- list_del_init(&op->pend_link);
-
- trace_fscache_op(object->cookie, op, fscache_op_cancel_all);
-
- ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
- op->cancel(op);
- op->state = FSCACHE_OP_ST_CANCELLED;
-
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
- object->n_exclusive--;
- if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
- wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
- fscache_put_operation(op);
- cond_resched_lock(&object->lock);
- }
-
- spin_unlock(&object->lock);
- _leave("");
-}
-
-/*
- * Record the completion or cancellation of an in-progress operation.
- */
-void fscache_op_complete(struct fscache_operation *op, bool cancelled)
-{
- struct fscache_object *object = op->object;
-
- _enter("OBJ%x", object->debug_id);
-
- ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
- ASSERTCMP(object->n_in_progress, >, 0);
- ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
- object->n_exclusive, >, 0);
- ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
- object->n_in_progress, ==, 1);
-
- spin_lock(&object->lock);
-
- if (!cancelled) {
- trace_fscache_op(object->cookie, op, fscache_op_completed);
- op->state = FSCACHE_OP_ST_COMPLETE;
- } else {
- op->cancel(op);
- trace_fscache_op(object->cookie, op, fscache_op_cancelled);
- op->state = FSCACHE_OP_ST_CANCELLED;
- }
-
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
- object->n_exclusive--;
- object->n_in_progress--;
- if (object->n_in_progress == 0)
- fscache_start_operations(object);
-
- spin_unlock(&object->lock);
- _leave("");
-}
-EXPORT_SYMBOL(fscache_op_complete);
-
-/*
- * release an operation
- * - queues pending ops if this is the last in-progress op
- */
-void fscache_put_operation(struct fscache_operation *op)
-{
- struct fscache_object *object;
- struct fscache_cache *cache;
-
- _enter("{OBJ%x OP%x,%d}",
- op->object ? op->object->debug_id : 0,
- op->debug_id, atomic_read(&op->usage));
-
- ASSERTCMP(atomic_read(&op->usage), >, 0);
-
- if (!atomic_dec_and_test(&op->usage))
- return;
-
- trace_fscache_op(op->object ? op->object->cookie : NULL, op, fscache_op_put);
-
- _debug("PUT OP");
- ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED &&
- op->state != FSCACHE_OP_ST_COMPLETE,
- op->state, ==, FSCACHE_OP_ST_CANCELLED);
-
- fscache_stat(&fscache_n_op_release);
-
- if (op->release) {
- op->release(op);
- op->release = NULL;
- }
- op->state = FSCACHE_OP_ST_DEAD;
-
- object = op->object;
- if (likely(object)) {
- if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
- atomic_dec(&object->n_reads);
- if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
- fscache_unuse_cookie(object);
-
- /* now... we may get called with the object spinlock held, so we
- * complete the cleanup here only if we can immediately acquire the
- * lock, and defer it otherwise */
- if (!spin_trylock(&object->lock)) {
- _debug("defer put");
- fscache_stat(&fscache_n_op_deferred_release);
-
- cache = object->cache;
- spin_lock(&cache->op_gc_list_lock);
- list_add_tail(&op->pend_link, &cache->op_gc_list);
- spin_unlock(&cache->op_gc_list_lock);
- schedule_work(&cache->op_gc);
- _leave(" [defer]");
- return;
- }
-
- ASSERTCMP(object->n_ops, >, 0);
- object->n_ops--;
- if (object->n_ops == 0)
- fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
-
- spin_unlock(&object->lock);
- }
-
- kfree(op);
- _leave(" [done]");
-}
-EXPORT_SYMBOL(fscache_put_operation);
-
-/*
- * garbage collect operations that have had their release deferred
- */
-void fscache_operation_gc(struct work_struct *work)
-{
- struct fscache_operation *op;
- struct fscache_object *object;
- struct fscache_cache *cache =
- container_of(work, struct fscache_cache, op_gc);
- int count = 0;
-
- _enter("");
-
- do {
- spin_lock(&cache->op_gc_list_lock);
- if (list_empty(&cache->op_gc_list)) {
- spin_unlock(&cache->op_gc_list_lock);
- break;
- }
-
- op = list_entry(cache->op_gc_list.next,
- struct fscache_operation, pend_link);
- list_del(&op->pend_link);
- spin_unlock(&cache->op_gc_list_lock);
-
- object = op->object;
- trace_fscache_op(object->cookie, op, fscache_op_gc);
-
- spin_lock(&object->lock);
-
- _debug("GC DEFERRED REL OBJ%x OP%x",
- object->debug_id, op->debug_id);
- fscache_stat(&fscache_n_op_gc);
-
- ASSERTCMP(atomic_read(&op->usage), ==, 0);
- ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD);
-
- ASSERTCMP(object->n_ops, >, 0);
- object->n_ops--;
- if (object->n_ops == 0)
- fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
-
- spin_unlock(&object->lock);
- kfree(op);
-
- } while (count++ < 20);
-
- if (!list_empty(&cache->op_gc_list))
- schedule_work(&cache->op_gc);
-
- _leave("");
-}
-
-/*
- * execute an operation using fs_op_wq to provide processing context -
- * the caller holds a ref to this object, so we don't need to hold one
- */
-void fscache_op_work_func(struct work_struct *work)
-{
- struct fscache_operation *op =
- container_of(work, struct fscache_operation, work);
-
- _enter("{OBJ%x OP%x,%d}",
- op->object->debug_id, op->debug_id, atomic_read(&op->usage));
-
- trace_fscache_op(op->object->cookie, op, fscache_op_work);
-
- ASSERT(op->processor != NULL);
- op->processor(op);
- fscache_put_operation(op);
-
- _leave("");
-}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
deleted file mode 100644
index 27df94ef0e0b..000000000000
--- a/fs/fscache/page.c
+++ /dev/null
@@ -1,1242 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Cache page management and data I/O routines
- *
- * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#define FSCACHE_DEBUG_LEVEL PAGE
-#include <linux/module.h>
-#include <linux/fscache-cache.h>
-#include <linux/buffer_head.h>
-#include <linux/pagevec.h>
-#include <linux/slab.h>
-#include "internal.h"
-
-/*
- * check to see if a page is being written to the cache
- */
-bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
-{
- void *val;
-
- rcu_read_lock();
- val = radix_tree_lookup(&cookie->stores, page->index);
- rcu_read_unlock();
- trace_fscache_check_page(cookie, page, val, 0);
-
- return val != NULL;
-}
-EXPORT_SYMBOL(__fscache_check_page_write);
-
-/*
- * wait for a page to finish being written to the cache
- */
-void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
-{
- wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
-
- trace_fscache_page(cookie, page, fscache_page_write_wait);
-
- wait_event(*wq, !__fscache_check_page_write(cookie, page));
-}
-EXPORT_SYMBOL(__fscache_wait_on_page_write);
-
-/*
- * wait for a page to finish being written to the cache. Put a timeout here
- * since we might be called recursively via parent fs.
- */
-static
-bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
-{
- wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
-
- return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page),
- HZ);
-}
-
-/*
- * decide whether a page can be released, possibly by cancelling a store to it
- * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged
- */
-bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
- struct page *page,
- gfp_t gfp)
-{
- struct page *xpage;
- void *val;
-
- _enter("%p,%p,%x", cookie, page, gfp);
-
- trace_fscache_page(cookie, page, fscache_page_maybe_release);
-
-try_again:
- rcu_read_lock();
- val = radix_tree_lookup(&cookie->stores, page->index);
- if (!val) {
- rcu_read_unlock();
- fscache_stat(&fscache_n_store_vmscan_not_storing);
- __fscache_uncache_page(cookie, page);
- return true;
- }
-
- /* see if the page is actually undergoing storage - if so we can't get
- * rid of it till the cache has finished with it */
- if (radix_tree_tag_get(&cookie->stores, page->index,
- FSCACHE_COOKIE_STORING_TAG)) {
- rcu_read_unlock();
- goto page_busy;
- }
-
- /* the page is pending storage, so we attempt to cancel the store and
- * discard the store request so that the page can be reclaimed */
- spin_lock(&cookie->stores_lock);
- rcu_read_unlock();
-
- if (radix_tree_tag_get(&cookie->stores, page->index,
- FSCACHE_COOKIE_STORING_TAG)) {
- /* the page started to undergo storage whilst we were looking,
- * so now we can only wait or return */
- spin_unlock(&cookie->stores_lock);
- goto page_busy;
- }
-
- xpage = radix_tree_delete(&cookie->stores, page->index);
- trace_fscache_page(cookie, page, fscache_page_radix_delete);
- spin_unlock(&cookie->stores_lock);
-
- if (xpage) {
- fscache_stat(&fscache_n_store_vmscan_cancelled);
- fscache_stat(&fscache_n_store_radix_deletes);
- ASSERTCMP(xpage, ==, page);
- } else {
- fscache_stat(&fscache_n_store_vmscan_gone);
- }
-
- wake_up_bit(&cookie->flags, 0);
- trace_fscache_wake_cookie(cookie);
- if (xpage)
- put_page(xpage);
- __fscache_uncache_page(cookie, page);
- return true;
-
-page_busy:
- /* We will wait here if we're allowed to, but that could deadlock the
- * allocator as the work threads writing to the cache may all end up
- * sleeping on memory allocation, so we may need to impose a timeout
- * too. */
- if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) {
- fscache_stat(&fscache_n_store_vmscan_busy);
- return false;
- }
-
- fscache_stat(&fscache_n_store_vmscan_wait);
- if (!release_page_wait_timeout(cookie, page))
- _debug("fscache writeout timeout page: %p{%lx}",
- page, page->index);
-
- gfp &= ~__GFP_DIRECT_RECLAIM;
- goto try_again;
-}
-EXPORT_SYMBOL(__fscache_maybe_release_page);
-
-/*
- * note that a page has finished being written to the cache
- */
-static void fscache_end_page_write(struct fscache_object *object,
- struct page *page)
-{
- struct fscache_cookie *cookie;
- struct page *xpage = NULL, *val;
-
- spin_lock(&object->lock);
- cookie = object->cookie;
- if (cookie) {
- /* delete the page from the tree if it is now no longer
- * pending */
- spin_lock(&cookie->stores_lock);
- radix_tree_tag_clear(&cookie->stores, page->index,
- FSCACHE_COOKIE_STORING_TAG);
- trace_fscache_page(cookie, page, fscache_page_radix_clear_store);
- if (!radix_tree_tag_get(&cookie->stores, page->index,
- FSCACHE_COOKIE_PENDING_TAG)) {
- fscache_stat(&fscache_n_store_radix_deletes);
- xpage = radix_tree_delete(&cookie->stores, page->index);
- trace_fscache_page(cookie, page, fscache_page_radix_delete);
- trace_fscache_page(cookie, page, fscache_page_write_end);
-
- val = radix_tree_lookup(&cookie->stores, page->index);
- trace_fscache_check_page(cookie, page, val, 1);
- } else {
- trace_fscache_page(cookie, page, fscache_page_write_end_pend);
- }
- spin_unlock(&cookie->stores_lock);
- wake_up_bit(&cookie->flags, 0);
- trace_fscache_wake_cookie(cookie);
- } else {
- trace_fscache_page(cookie, page, fscache_page_write_end_noc);
- }
- spin_unlock(&object->lock);
- if (xpage)
- put_page(xpage);
-}
-
-/*
- * actually apply the changed attributes to a cache object
- */
-static void fscache_attr_changed_op(struct fscache_operation *op)
-{
- struct fscache_object *object = op->object;
- int ret;
-
- _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
-
- fscache_stat(&fscache_n_attr_changed_calls);
-
- if (fscache_object_is_active(object)) {
- fscache_stat(&fscache_n_cop_attr_changed);
- ret = object->cache->ops->attr_changed(object);
- fscache_stat_d(&fscache_n_cop_attr_changed);
- if (ret < 0)
- fscache_abort_object(object);
- fscache_op_complete(op, ret < 0);
- } else {
- fscache_op_complete(op, true);
- }
-
- _leave("");
-}
-
-/*
- * notification that the attributes on an object have changed
- */
-int __fscache_attr_changed(struct fscache_cookie *cookie)
-{
- struct fscache_operation *op;
- struct fscache_object *object;
- bool wake_cookie = false;
-
- _enter("%p", cookie);
-
- ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
-
- fscache_stat(&fscache_n_attr_changed);
-
- op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op) {
- fscache_stat(&fscache_n_attr_changed_nomem);
- _leave(" = -ENOMEM");
- return -ENOMEM;
- }
-
- fscache_operation_init(cookie, op, fscache_attr_changed_op, NULL, NULL);
- trace_fscache_page_op(cookie, NULL, op, fscache_page_op_attr_changed);
- op->flags = FSCACHE_OP_ASYNC |
- (1 << FSCACHE_OP_EXCLUSIVE) |
- (1 << FSCACHE_OP_UNUSE_COOKIE);
-
- spin_lock(&cookie->lock);
-
- if (!fscache_cookie_enabled(cookie) ||
- hlist_empty(&cookie->backing_objects))
- goto nobufs;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
-
- __fscache_use_cookie(cookie);
- if (fscache_submit_exclusive_op(object, op) < 0)
- goto nobufs_dec;
- spin_unlock(&cookie->lock);
- fscache_stat(&fscache_n_attr_changed_ok);
- fscache_put_operation(op);
- _leave(" = 0");
- return 0;
-
-nobufs_dec:
- wake_cookie = __fscache_unuse_cookie(cookie);
-nobufs:
- spin_unlock(&cookie->lock);
- fscache_put_operation(op);
- if (wake_cookie)
- __fscache_wake_unused_cookie(cookie);
- fscache_stat(&fscache_n_attr_changed_nobufs);
- _leave(" = %d", -ENOBUFS);
- return -ENOBUFS;
-}
-EXPORT_SYMBOL(__fscache_attr_changed);
-
-/*
- * Handle cancellation of a pending retrieval op
- */
-static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
-{
- struct fscache_retrieval *op =
- container_of(_op, struct fscache_retrieval, op);
-
- atomic_set(&op->n_pages, 0);
-}
-
-/*
- * release a retrieval op reference
- */
-static void fscache_release_retrieval_op(struct fscache_operation *_op)
-{
- struct fscache_retrieval *op =
- container_of(_op, struct fscache_retrieval, op);
-
- _enter("{OP%x}", op->op.debug_id);
-
- ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED,
- atomic_read(&op->n_pages), ==, 0);
-
- if (op->context)
- fscache_put_context(op->cookie, op->context);
-
- _leave("");
-}
-
-/*
- * allocate a retrieval op
- */
-struct fscache_retrieval *fscache_alloc_retrieval(
- struct fscache_cookie *cookie,
- struct address_space *mapping,
- fscache_rw_complete_t end_io_func,
- void *context)
-{
- struct fscache_retrieval *op;
-
- /* allocate a retrieval operation and attempt to submit it */
- op = kzalloc(sizeof(*op), GFP_NOIO);
- if (!op) {
- fscache_stat(&fscache_n_retrievals_nomem);
- return NULL;
- }
-
- fscache_operation_init(cookie, &op->op, NULL,
- fscache_do_cancel_retrieval,
- fscache_release_retrieval_op);
- op->op.flags = FSCACHE_OP_MYTHREAD |
- (1UL << FSCACHE_OP_WAITING) |
- (1UL << FSCACHE_OP_UNUSE_COOKIE);
- op->cookie = cookie;
- op->mapping = mapping;
- op->end_io_func = end_io_func;
- op->context = context;
- INIT_LIST_HEAD(&op->to_do);
-
- /* Pin the netfs read context in case we need to do the actual netfs
- * read because we've encountered a cache read failure.
- */
- if (context)
- fscache_get_context(op->cookie, context);
- return op;
-}
-
-/*
- * wait for a deferred lookup to complete
- */
-int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
-{
- _enter("");
-
- if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
- _leave(" = 0 [imm]");
- return 0;
- }
-
- fscache_stat(&fscache_n_retrievals_wait);
-
- if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
- TASK_INTERRUPTIBLE) != 0) {
- fscache_stat(&fscache_n_retrievals_intr);
- _leave(" = -ERESTARTSYS");
- return -ERESTARTSYS;
- }
-
- ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
-
- smp_rmb();
- _leave(" = 0 [dly]");
- return 0;
-}
-
-/*
- * wait for an object to become active (or dead)
- */
-int fscache_wait_for_operation_activation(struct fscache_object *object,
- struct fscache_operation *op,
- atomic_t *stat_op_waits,
- atomic_t *stat_object_dead)
-{
- int ret;
-
- if (!test_bit(FSCACHE_OP_WAITING, &op->flags))
- goto check_if_dead;
-
- _debug(">>> WT");
- if (stat_op_waits)
- fscache_stat(stat_op_waits);
- if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
- TASK_INTERRUPTIBLE) != 0) {
- trace_fscache_op(object->cookie, op, fscache_op_signal);
- ret = fscache_cancel_op(op, false);
- if (ret == 0)
- return -ERESTARTSYS;
-
- /* it's been removed from the pending queue by another party,
- * so we should get to run shortly */
- wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
- TASK_UNINTERRUPTIBLE);
- }
- _debug("<<< GO");
-
-check_if_dead:
- if (op->state == FSCACHE_OP_ST_CANCELLED) {
- if (stat_object_dead)
- fscache_stat(stat_object_dead);
- _leave(" = -ENOBUFS [cancelled]");
- return -ENOBUFS;
- }
- if (unlikely(fscache_object_is_dying(object) ||
- fscache_cache_is_broken(object))) {
- enum fscache_operation_state state = op->state;
- trace_fscache_op(object->cookie, op, fscache_op_signal);
- fscache_cancel_op(op, true);
- if (stat_object_dead)
- fscache_stat(stat_object_dead);
- _leave(" = -ENOBUFS [obj dead %d]", state);
- return -ENOBUFS;
- }
- return 0;
-}
-
-/*
- * read a page from the cache or allocate a block in which to store it
- * - we return:
- * -ENOMEM - out of memory, nothing done
- * -ERESTARTSYS - interrupted
- * -ENOBUFS - no backing object available in which to cache the block
- * -ENODATA - no data available in the backing object for this block
- * 0 - dispatched a read - it'll call end_io_func() when finished
- */
-int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
- struct page *page,
- fscache_rw_complete_t end_io_func,
- void *context,
- gfp_t gfp)
-{
- struct fscache_retrieval *op;
- struct fscache_object *object;
- bool wake_cookie = false;
- int ret;
-
- _enter("%p,%p,,,", cookie, page);
-
- fscache_stat(&fscache_n_retrievals);
-
- if (hlist_empty(&cookie->backing_objects))
- goto nobufs;
-
- if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
- _leave(" = -ENOBUFS [invalidating]");
- return -ENOBUFS;
- }
-
- ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
- ASSERTCMP(page, !=, NULL);
-
- if (fscache_wait_for_deferred_lookup(cookie) < 0)
- return -ERESTARTSYS;
-
- op = fscache_alloc_retrieval(cookie, page->mapping,
- end_io_func, context);
- if (!op) {
- _leave(" = -ENOMEM");
- return -ENOMEM;
- }
- atomic_set(&op->n_pages, 1);
- trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_retr_one);
-
- spin_lock(&cookie->lock);
-
- if (!fscache_cookie_enabled(cookie) ||
- hlist_empty(&cookie->backing_objects))
- goto nobufs_unlock;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
-
- ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
-
- __fscache_use_cookie(cookie);
- atomic_inc(&object->n_reads);
- __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
-
- if (fscache_submit_op(object, &op->op) < 0)
- goto nobufs_unlock_dec;
- spin_unlock(&cookie->lock);
-
- fscache_stat(&fscache_n_retrieval_ops);
-
- /* we wait for the operation to become active, and then process it
- * *here*, in this thread, and not in the thread pool */
- ret = fscache_wait_for_operation_activation(
- object, &op->op,
- __fscache_stat(&fscache_n_retrieval_op_waits),
- __fscache_stat(&fscache_n_retrievals_object_dead));
- if (ret < 0)
- goto error;
-
- /* ask the cache to honour the operation */
- if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
- fscache_stat(&fscache_n_cop_allocate_page);
- ret = object->cache->ops->allocate_page(op, page, gfp);
- fscache_stat_d(&fscache_n_cop_allocate_page);
- if (ret == 0)
- ret = -ENODATA;
- } else {
- fscache_stat(&fscache_n_cop_read_or_alloc_page);
- ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
- fscache_stat_d(&fscache_n_cop_read_or_alloc_page);
- }
-
-error:
- if (ret == -ENOMEM)
- fscache_stat(&fscache_n_retrievals_nomem);
- else if (ret == -ERESTARTSYS)
- fscache_stat(&fscache_n_retrievals_intr);
- else if (ret == -ENODATA)
- fscache_stat(&fscache_n_retrievals_nodata);
- else if (ret < 0)
- fscache_stat(&fscache_n_retrievals_nobufs);
- else
- fscache_stat(&fscache_n_retrievals_ok);
-
- fscache_put_retrieval(op);
- _leave(" = %d", ret);
- return ret;
-
-nobufs_unlock_dec:
- atomic_dec(&object->n_reads);
- wake_cookie = __fscache_unuse_cookie(cookie);
-nobufs_unlock:
- spin_unlock(&cookie->lock);
- if (wake_cookie)
- __fscache_wake_unused_cookie(cookie);
- fscache_put_retrieval(op);
-nobufs:
- fscache_stat(&fscache_n_retrievals_nobufs);
- _leave(" = -ENOBUFS");
- return -ENOBUFS;
-}
-EXPORT_SYMBOL(__fscache_read_or_alloc_page);
-
-/*
- * read a list of page from the cache or allocate a block in which to store
- * them
- * - we return:
- * -ENOMEM - out of memory, some pages may be being read
- * -ERESTARTSYS - interrupted, some pages may be being read
- * -ENOBUFS - no backing object or space available in which to cache any
- * pages not being read
- * -ENODATA - no data available in the backing object for some or all of
- * the pages
- * 0 - dispatched a read on all pages
- *
- * end_io_func() will be called for each page read from the cache as it is
- * finishes being read
- *
- * any pages for which a read is dispatched will be removed from pages and
- * nr_pages
- */
-int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages,
- fscache_rw_complete_t end_io_func,
- void *context,
- gfp_t gfp)
-{
- struct fscache_retrieval *op;
- struct fscache_object *object;
- bool wake_cookie = false;
- int ret;
-
- _enter("%p,,%d,,,", cookie, *nr_pages);
-
- fscache_stat(&fscache_n_retrievals);
-
- if (hlist_empty(&cookie->backing_objects))
- goto nobufs;
-
- if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
- _leave(" = -ENOBUFS [invalidating]");
- return -ENOBUFS;
- }
-
- ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
- ASSERTCMP(*nr_pages, >, 0);
- ASSERT(!list_empty(pages));
-
- if (fscache_wait_for_deferred_lookup(cookie) < 0)
- return -ERESTARTSYS;
-
- op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
- if (!op)
- return -ENOMEM;
- atomic_set(&op->n_pages, *nr_pages);
- trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi);
-
- spin_lock(&cookie->lock);
-
- if (!fscache_cookie_enabled(cookie) ||
- hlist_empty(&cookie->backing_objects))
- goto nobufs_unlock;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
-
- __fscache_use_cookie(cookie);
- atomic_inc(&object->n_reads);
- __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
-
- if (fscache_submit_op(object, &op->op) < 0)
- goto nobufs_unlock_dec;
- spin_unlock(&cookie->lock);
-
- fscache_stat(&fscache_n_retrieval_ops);
-
- /* we wait for the operation to become active, and then process it
- * *here*, in this thread, and not in the thread pool */
- ret = fscache_wait_for_operation_activation(
- object, &op->op,
- __fscache_stat(&fscache_n_retrieval_op_waits),
- __fscache_stat(&fscache_n_retrievals_object_dead));
- if (ret < 0)
- goto error;
-
- /* ask the cache to honour the operation */
- if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
- fscache_stat(&fscache_n_cop_allocate_pages);
- ret = object->cache->ops->allocate_pages(
- op, pages, nr_pages, gfp);
- fscache_stat_d(&fscache_n_cop_allocate_pages);
- } else {
- fscache_stat(&fscache_n_cop_read_or_alloc_pages);
- ret = object->cache->ops->read_or_alloc_pages(
- op, pages, nr_pages, gfp);
- fscache_stat_d(&fscache_n_cop_read_or_alloc_pages);
- }
-
-error:
- if (ret == -ENOMEM)
- fscache_stat(&fscache_n_retrievals_nomem);
- else if (ret == -ERESTARTSYS)
- fscache_stat(&fscache_n_retrievals_intr);
- else if (ret == -ENODATA)
- fscache_stat(&fscache_n_retrievals_nodata);
- else if (ret < 0)
- fscache_stat(&fscache_n_retrievals_nobufs);
- else
- fscache_stat(&fscache_n_retrievals_ok);
-
- fscache_put_retrieval(op);
- _leave(" = %d", ret);
- return ret;
-
-nobufs_unlock_dec:
- atomic_dec(&object->n_reads);
- wake_cookie = __fscache_unuse_cookie(cookie);
-nobufs_unlock:
- spin_unlock(&cookie->lock);
- fscache_put_retrieval(op);
- if (wake_cookie)
- __fscache_wake_unused_cookie(cookie);
-nobufs:
- fscache_stat(&fscache_n_retrievals_nobufs);
- _leave(" = -ENOBUFS");
- return -ENOBUFS;
-}
-EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
-
-/*
- * allocate a block in the cache on which to store a page
- * - we return:
- * -ENOMEM - out of memory, nothing done
- * -ERESTARTSYS - interrupted
- * -ENOBUFS - no backing object available in which to cache the block
- * 0 - block allocated
- */
-int __fscache_alloc_page(struct fscache_cookie *cookie,
- struct page *page,
- gfp_t gfp)
-{
- struct fscache_retrieval *op;
- struct fscache_object *object;
- bool wake_cookie = false;
- int ret;
-
- _enter("%p,%p,,,", cookie, page);
-
- fscache_stat(&fscache_n_allocs);
-
- if (hlist_empty(&cookie->backing_objects))
- goto nobufs;
-
- ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
- ASSERTCMP(page, !=, NULL);
-
- if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
- _leave(" = -ENOBUFS [invalidating]");
- return -ENOBUFS;
- }
-
- if (fscache_wait_for_deferred_lookup(cookie) < 0)
- return -ERESTARTSYS;
-
- op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
- if (!op)
- return -ENOMEM;
- atomic_set(&op->n_pages, 1);
- trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_alloc_one);
-
- spin_lock(&cookie->lock);
-
- if (!fscache_cookie_enabled(cookie) ||
- hlist_empty(&cookie->backing_objects))
- goto nobufs_unlock;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
-
- __fscache_use_cookie(cookie);
- if (fscache_submit_op(object, &op->op) < 0)
- goto nobufs_unlock_dec;
- spin_unlock(&cookie->lock);
-
- fscache_stat(&fscache_n_alloc_ops);
-
- ret = fscache_wait_for_operation_activation(
- object, &op->op,
- __fscache_stat(&fscache_n_alloc_op_waits),
- __fscache_stat(&fscache_n_allocs_object_dead));
- if (ret < 0)
- goto error;
-
- /* ask the cache to honour the operation */
- fscache_stat(&fscache_n_cop_allocate_page);
- ret = object->cache->ops->allocate_page(op, page, gfp);
- fscache_stat_d(&fscache_n_cop_allocate_page);
-
-error:
- if (ret == -ERESTARTSYS)
- fscache_stat(&fscache_n_allocs_intr);
- else if (ret < 0)
- fscache_stat(&fscache_n_allocs_nobufs);
- else
- fscache_stat(&fscache_n_allocs_ok);
-
- fscache_put_retrieval(op);
- _leave(" = %d", ret);
- return ret;
-
-nobufs_unlock_dec:
- wake_cookie = __fscache_unuse_cookie(cookie);
-nobufs_unlock:
- spin_unlock(&cookie->lock);
- fscache_put_retrieval(op);
- if (wake_cookie)
- __fscache_wake_unused_cookie(cookie);
-nobufs:
- fscache_stat(&fscache_n_allocs_nobufs);
- _leave(" = -ENOBUFS");
- return -ENOBUFS;
-}
-EXPORT_SYMBOL(__fscache_alloc_page);
-
-/*
- * Unmark pages allocate in the readahead code path (via:
- * fscache_readpages_or_alloc) after delegating to the base filesystem
- */
-void __fscache_readpages_cancel(struct fscache_cookie *cookie,
- struct list_head *pages)
-{
- struct page *page;
-
- list_for_each_entry(page, pages, lru) {
- if (PageFsCache(page))
- __fscache_uncache_page(cookie, page);
- }
-}
-EXPORT_SYMBOL(__fscache_readpages_cancel);
-
-/*
- * release a write op reference
- */
-static void fscache_release_write_op(struct fscache_operation *_op)
-{
- _enter("{OP%x}", _op->debug_id);
-}
-
-/*
- * perform the background storage of a page into the cache
- */
-static void fscache_write_op(struct fscache_operation *_op)
-{
- struct fscache_storage *op =
- container_of(_op, struct fscache_storage, op);
- struct fscache_object *object = op->op.object;
- struct fscache_cookie *cookie;
- struct page *page;
- unsigned n;
- void *results[1];
- int ret;
-
- _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
-
-again:
- spin_lock(&object->lock);
- cookie = object->cookie;
-
- if (!fscache_object_is_active(object)) {
- /* If we get here, then the on-disk cache object likely no
- * longer exists, so we should just cancel this write
- * operation.
- */
- spin_unlock(&object->lock);
- fscache_op_complete(&op->op, true);
- _leave(" [inactive]");
- return;
- }
-
- if (!cookie) {
- /* If we get here, then the cookie belonging to the object was
- * detached, probably by the cookie being withdrawn due to
- * memory pressure, which means that the pages we might write
- * to the cache from no longer exist - therefore, we can just
- * cancel this write operation.
- */
- spin_unlock(&object->lock);
- fscache_op_complete(&op->op, true);
- _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
- _op->flags, _op->state, object->state->short_name,
- object->flags);
- return;
- }
-
- spin_lock(&cookie->stores_lock);
-
- fscache_stat(&fscache_n_store_calls);
-
- /* find a page to store */
- results[0] = NULL;
- page = NULL;
- n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
- FSCACHE_COOKIE_PENDING_TAG);
- trace_fscache_gang_lookup(cookie, &op->op, results, n, op->store_limit);
- if (n != 1)
- goto superseded;
- page = results[0];
- _debug("gang %d [%lx]", n, page->index);
-
- radix_tree_tag_set(&cookie->stores, page->index,
- FSCACHE_COOKIE_STORING_TAG);
- radix_tree_tag_clear(&cookie->stores, page->index,
- FSCACHE_COOKIE_PENDING_TAG);
- trace_fscache_page(cookie, page, fscache_page_radix_pend2store);
-
- spin_unlock(&cookie->stores_lock);
- spin_unlock(&object->lock);
-
- if (page->index >= op->store_limit)
- goto discard_page;
-
- fscache_stat(&fscache_n_store_pages);
- fscache_stat(&fscache_n_cop_write_page);
- ret = object->cache->ops->write_page(op, page);
- fscache_stat_d(&fscache_n_cop_write_page);
- trace_fscache_wrote_page(cookie, page, &op->op, ret);
- fscache_end_page_write(object, page);
- if (ret < 0) {
- fscache_abort_object(object);
- fscache_op_complete(&op->op, true);
- } else {
- fscache_enqueue_operation(&op->op);
- }
-
- _leave("");
- return;
-
-discard_page:
- fscache_stat(&fscache_n_store_pages_over_limit);
- trace_fscache_wrote_page(cookie, page, &op->op, -ENOBUFS);
- fscache_end_page_write(object, page);
- goto again;
-
-superseded:
- /* this writer is going away and there aren't any more things to
- * write */
- _debug("cease");
- spin_unlock(&cookie->stores_lock);
- clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
- spin_unlock(&object->lock);
- fscache_op_complete(&op->op, false);
- _leave("");
-}
-
-/*
- * Clear the pages pending writing for invalidation
- */
-void fscache_invalidate_writes(struct fscache_cookie *cookie)
-{
- struct page *page;
- void *results[16];
- int n, i;
-
- _enter("");
-
- for (;;) {
- spin_lock(&cookie->stores_lock);
- n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
- ARRAY_SIZE(results),
- FSCACHE_COOKIE_PENDING_TAG);
- if (n == 0) {
- spin_unlock(&cookie->stores_lock);
- break;
- }
-
- for (i = n - 1; i >= 0; i--) {
- page = results[i];
- radix_tree_delete(&cookie->stores, page->index);
- trace_fscache_page(cookie, page, fscache_page_radix_delete);
- trace_fscache_page(cookie, page, fscache_page_inval);
- }
-
- spin_unlock(&cookie->stores_lock);
-
- for (i = n - 1; i >= 0; i--)
- put_page(results[i]);
- }
-
- wake_up_bit(&cookie->flags, 0);
- trace_fscache_wake_cookie(cookie);
-
- _leave("");
-}
-
-/*
- * request a page be stored in the cache
- * - returns:
- * -ENOMEM - out of memory, nothing done
- * -ENOBUFS - no backing object available in which to cache the page
- * 0 - dispatched a write - it'll call end_io_func() when finished
- *
- * if the cookie still has a backing object at this point, that object can be
- * in one of a few states with respect to storage processing:
- *
- * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
- * set)
- *
- * (a) no writes yet
- *
- * (b) writes deferred till post-creation (mark page for writing and
- * return immediately)
- *
- * (2) negative lookup, object created, initial fill being made from netfs
- *
- * (a) fill point not yet reached this page (mark page for writing and
- * return)
- *
- * (b) fill point passed this page (queue op to store this page)
- *
- * (3) object extant (queue op to store this page)
- *
- * any other state is invalid
- */
-int __fscache_write_page(struct fscache_cookie *cookie,
- struct page *page,
- loff_t object_size,
- gfp_t gfp)
-{
- struct fscache_storage *op;
- struct fscache_object *object;
- bool wake_cookie = false;
- int ret;
-
- _enter("%p,%x,", cookie, (u32) page->flags);
-
- ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
- ASSERT(PageFsCache(page));
-
- fscache_stat(&fscache_n_stores);
-
- if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
- _leave(" = -ENOBUFS [invalidating]");
- return -ENOBUFS;
- }
-
- op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
- if (!op)
- goto nomem;
-
- fscache_operation_init(cookie, &op->op, fscache_write_op, NULL,
- fscache_release_write_op);
- op->op.flags = FSCACHE_OP_ASYNC |
- (1 << FSCACHE_OP_WAITING) |
- (1 << FSCACHE_OP_UNUSE_COOKIE);
-
- ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM);
- if (ret < 0)
- goto nomem_free;
-
- trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_write_one);
-
- ret = -ENOBUFS;
- spin_lock(&cookie->lock);
-
- if (!fscache_cookie_enabled(cookie) ||
- hlist_empty(&cookie->backing_objects))
- goto nobufs;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
- if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
- goto nobufs;
-
- trace_fscache_page(cookie, page, fscache_page_write);
-
- /* add the page to the pending-storage radix tree on the backing
- * object */
- spin_lock(&object->lock);
-
- if (object->store_limit_l != object_size)
- fscache_set_store_limit(object, object_size);
-
- spin_lock(&cookie->stores_lock);
-
- _debug("store limit %llx", (unsigned long long) object->store_limit);
-
- ret = radix_tree_insert(&cookie->stores, page->index, page);
- if (ret < 0) {
- if (ret == -EEXIST)
- goto already_queued;
- _debug("insert failed %d", ret);
- goto nobufs_unlock_obj;
- }
-
- trace_fscache_page(cookie, page, fscache_page_radix_insert);
- radix_tree_tag_set(&cookie->stores, page->index,
- FSCACHE_COOKIE_PENDING_TAG);
- trace_fscache_page(cookie, page, fscache_page_radix_set_pend);
- get_page(page);
-
- /* we only want one writer at a time, but we do need to queue new
- * writers after exclusive ops */
- if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
- goto already_pending;
-
- spin_unlock(&cookie->stores_lock);
- spin_unlock(&object->lock);
-
- op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
- op->store_limit = object->store_limit;
-
- __fscache_use_cookie(cookie);
- if (fscache_submit_op(object, &op->op) < 0)
- goto submit_failed;
-
- spin_unlock(&cookie->lock);
- radix_tree_preload_end();
- fscache_stat(&fscache_n_store_ops);
- fscache_stat(&fscache_n_stores_ok);
-
- /* the work queue now carries its own ref on the object */
- fscache_put_operation(&op->op);
- _leave(" = 0");
- return 0;
-
-already_queued:
- fscache_stat(&fscache_n_stores_again);
-already_pending:
- spin_unlock(&cookie->stores_lock);
- spin_unlock(&object->lock);
- spin_unlock(&cookie->lock);
- radix_tree_preload_end();
- fscache_put_operation(&op->op);
- fscache_stat(&fscache_n_stores_ok);
- _leave(" = 0");
- return 0;
-
-submit_failed:
- spin_lock(&cookie->stores_lock);
- radix_tree_delete(&cookie->stores, page->index);
- trace_fscache_page(cookie, page, fscache_page_radix_delete);
- spin_unlock(&cookie->stores_lock);
- wake_cookie = __fscache_unuse_cookie(cookie);
- put_page(page);
- ret = -ENOBUFS;
- goto nobufs;
-
-nobufs_unlock_obj:
- spin_unlock(&cookie->stores_lock);
- spin_unlock(&object->lock);
-nobufs:
- spin_unlock(&cookie->lock);
- radix_tree_preload_end();
- fscache_put_operation(&op->op);
- if (wake_cookie)
- __fscache_wake_unused_cookie(cookie);
- fscache_stat(&fscache_n_stores_nobufs);
- _leave(" = -ENOBUFS");
- return -ENOBUFS;
-
-nomem_free:
- fscache_put_operation(&op->op);
-nomem:
- fscache_stat(&fscache_n_stores_oom);
- _leave(" = -ENOMEM");
- return -ENOMEM;
-}
-EXPORT_SYMBOL(__fscache_write_page);
-
-/*
- * remove a page from the cache
- */
-void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
-{
- struct fscache_object *object;
-
- _enter(",%p", page);
-
- ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
- ASSERTCMP(page, !=, NULL);
-
- fscache_stat(&fscache_n_uncaches);
-
- /* cache withdrawal may beat us to it */
- if (!PageFsCache(page))
- goto done;
-
- trace_fscache_page(cookie, page, fscache_page_uncache);
-
- /* get the object */
- spin_lock(&cookie->lock);
-
- if (hlist_empty(&cookie->backing_objects)) {
- ClearPageFsCache(page);
- goto done_unlock;
- }
-
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object, cookie_link);
-
- /* there might now be stuff on disk we could read */
- clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
-
- /* only invoke the cache backend if we managed to mark the page
- * uncached here; this deals with synchronisation vs withdrawal */
- if (TestClearPageFsCache(page) &&
- object->cache->ops->uncache_page) {
- /* the cache backend releases the cookie lock */
- fscache_stat(&fscache_n_cop_uncache_page);
- object->cache->ops->uncache_page(object, page);
- fscache_stat_d(&fscache_n_cop_uncache_page);
- goto done;
- }
-
-done_unlock:
- spin_unlock(&cookie->lock);
-done:
- _leave("");
-}
-EXPORT_SYMBOL(__fscache_uncache_page);
-
-/**
- * fscache_mark_page_cached - Mark a page as being cached
- * @op: The retrieval op pages are being marked for
- * @page: The page to be marked
- *
- * Mark a netfs page as being cached. After this is called, the netfs
- * must call fscache_uncache_page() to remove the mark.
- */
-void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
-{
- struct fscache_cookie *cookie = op->op.object->cookie;
-
-#ifdef CONFIG_FSCACHE_STATS
- atomic_inc(&fscache_n_marks);
-#endif
-
- trace_fscache_page(cookie, page, fscache_page_cached);
-
- _debug("- mark %p{%lx}", page, page->index);
- if (TestSetPageFsCache(page)) {
- static bool once_only;
- if (!once_only) {
- once_only = true;
- pr_warn("Cookie type %s marked page %lx multiple times\n",
- cookie->def->name, page->index);
- }
- }
-
- if (cookie->def->mark_page_cached)
- cookie->def->mark_page_cached(cookie->netfs_data,
- op->mapping, page);
-}
-EXPORT_SYMBOL(fscache_mark_page_cached);
-
-/**
- * fscache_mark_pages_cached - Mark pages as being cached
- * @op: The retrieval op pages are being marked for
- * @pagevec: The pages to be marked
- *
- * Mark a bunch of netfs pages as being cached. After this is called,
- * the netfs must call fscache_uncache_page() to remove the mark.
- */
-void fscache_mark_pages_cached(struct fscache_retrieval *op,
- struct pagevec *pagevec)
-{
- unsigned long loop;
-
- for (loop = 0; loop < pagevec->nr; loop++)
- fscache_mark_page_cached(op, pagevec->pages[loop]);
-
- pagevec_reinit(pagevec);
-}
-EXPORT_SYMBOL(fscache_mark_pages_cached);
-
-/*
- * Uncache all the pages in an inode that are marked PG_fscache, assuming them
- * to be associated with the given cookie.
- */
-void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
- struct inode *inode)
-{
- struct address_space *mapping = inode->i_mapping;
- struct pagevec pvec;
- pgoff_t next;
- int i;
-
- _enter("%p,%p", cookie, inode);
-
- if (!mapping || mapping->nrpages == 0) {
- _leave(" [no pages]");
- return;
- }
-
- pagevec_init(&pvec);
- next = 0;
- do {
- if (!pagevec_lookup(&pvec, mapping, &next))
- break;
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
- if (PageFsCache(page)) {
- __fscache_wait_on_page_write(cookie, page);
- __fscache_uncache_page(cookie, page);
- }
- }
- pagevec_release(&pvec);
- cond_resched();
- } while (next);
-
- _leave("");
-}
-EXPORT_SYMBOL(__fscache_uncache_all_inode_pages);
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index 061df8f61ffc..dc3b0e9c8cce 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -1,11 +1,11 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* FS-Cache statistics viewing interface
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
-#define FSCACHE_DEBUG_LEVEL OPERATION
+#define FSCACHE_DEBUG_LEVEL CACHE
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -16,42 +16,32 @@
*/
int __init fscache_proc_init(void)
{
- _enter("");
-
if (!proc_mkdir("fs/fscache", NULL))
goto error_dir;
+ if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL,
+ &fscache_caches_seq_ops))
+ goto error;
+
+ if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL,
+ &fscache_volumes_seq_ops))
+ goto error;
+
if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL,
&fscache_cookies_seq_ops))
- goto error_cookies;
+ goto error;
#ifdef CONFIG_FSCACHE_STATS
if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
- fscache_stats_show))
- goto error_stats;
+ fscache_stats_show))
+ goto error;
#endif
-#ifdef CONFIG_FSCACHE_OBJECT_LIST
- if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL,
- &fscache_objlist_proc_ops))
- goto error_objects;
-#endif
-
- _leave(" = 0");
return 0;
-#ifdef CONFIG_FSCACHE_OBJECT_LIST
-error_objects:
-#endif
-#ifdef CONFIG_FSCACHE_STATS
- remove_proc_entry("fs/fscache/stats", NULL);
-error_stats:
-#endif
- remove_proc_entry("fs/fscache/cookies", NULL);
-error_cookies:
+error:
remove_proc_entry("fs/fscache", NULL);
error_dir:
- _leave(" = -ENOMEM");
return -ENOMEM;
}
@@ -60,12 +50,5 @@ error_dir:
*/
void fscache_proc_cleanup(void)
{
-#ifdef CONFIG_FSCACHE_OBJECT_LIST
- remove_proc_entry("fs/fscache/objects", NULL);
-#endif
-#ifdef CONFIG_FSCACHE_STATS
- remove_proc_entry("fs/fscache/stats", NULL);
-#endif
- remove_proc_entry("fs/fscache/cookies", NULL);
- remove_proc_entry("fs/fscache", NULL);
+ remove_proc_subtree("fs/fscache", NULL);
}
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index a7c3ed89a3e0..fc94e5e79f1c 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* FS-Cache statistics
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
-#define FSCACHE_DEBUG_LEVEL THREAD
-#include <linux/module.h>
+#define FSCACHE_DEBUG_LEVEL CACHE
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "internal.h"
@@ -14,122 +13,41 @@
/*
* operation counters
*/
-atomic_t fscache_n_op_pend;
-atomic_t fscache_n_op_run;
-atomic_t fscache_n_op_enqueue;
-atomic_t fscache_n_op_deferred_release;
-atomic_t fscache_n_op_initialised;
-atomic_t fscache_n_op_release;
-atomic_t fscache_n_op_gc;
-atomic_t fscache_n_op_cancelled;
-atomic_t fscache_n_op_rejected;
-
-atomic_t fscache_n_attr_changed;
-atomic_t fscache_n_attr_changed_ok;
-atomic_t fscache_n_attr_changed_nobufs;
-atomic_t fscache_n_attr_changed_nomem;
-atomic_t fscache_n_attr_changed_calls;
-
-atomic_t fscache_n_allocs;
-atomic_t fscache_n_allocs_ok;
-atomic_t fscache_n_allocs_wait;
-atomic_t fscache_n_allocs_nobufs;
-atomic_t fscache_n_allocs_intr;
-atomic_t fscache_n_allocs_object_dead;
-atomic_t fscache_n_alloc_ops;
-atomic_t fscache_n_alloc_op_waits;
-
-atomic_t fscache_n_retrievals;
-atomic_t fscache_n_retrievals_ok;
-atomic_t fscache_n_retrievals_wait;
-atomic_t fscache_n_retrievals_nodata;
-atomic_t fscache_n_retrievals_nobufs;
-atomic_t fscache_n_retrievals_intr;
-atomic_t fscache_n_retrievals_nomem;
-atomic_t fscache_n_retrievals_object_dead;
-atomic_t fscache_n_retrieval_ops;
-atomic_t fscache_n_retrieval_op_waits;
-
-atomic_t fscache_n_stores;
-atomic_t fscache_n_stores_ok;
-atomic_t fscache_n_stores_again;
-atomic_t fscache_n_stores_nobufs;
-atomic_t fscache_n_stores_oom;
-atomic_t fscache_n_store_ops;
-atomic_t fscache_n_store_calls;
-atomic_t fscache_n_store_pages;
-atomic_t fscache_n_store_radix_deletes;
-atomic_t fscache_n_store_pages_over_limit;
-
-atomic_t fscache_n_store_vmscan_not_storing;
-atomic_t fscache_n_store_vmscan_gone;
-atomic_t fscache_n_store_vmscan_busy;
-atomic_t fscache_n_store_vmscan_cancelled;
-atomic_t fscache_n_store_vmscan_wait;
-
-atomic_t fscache_n_marks;
-atomic_t fscache_n_uncaches;
+atomic_t fscache_n_volumes;
+atomic_t fscache_n_volumes_collision;
+atomic_t fscache_n_volumes_nomem;
+atomic_t fscache_n_cookies;
+atomic_t fscache_n_cookies_lru;
+atomic_t fscache_n_cookies_lru_expired;
+atomic_t fscache_n_cookies_lru_removed;
+atomic_t fscache_n_cookies_lru_dropped;
atomic_t fscache_n_acquires;
-atomic_t fscache_n_acquires_null;
-atomic_t fscache_n_acquires_no_cache;
atomic_t fscache_n_acquires_ok;
-atomic_t fscache_n_acquires_nobufs;
atomic_t fscache_n_acquires_oom;
atomic_t fscache_n_invalidates;
-atomic_t fscache_n_invalidates_run;
atomic_t fscache_n_updates;
-atomic_t fscache_n_updates_null;
-atomic_t fscache_n_updates_run;
+EXPORT_SYMBOL(fscache_n_updates);
atomic_t fscache_n_relinquishes;
-atomic_t fscache_n_relinquishes_null;
-atomic_t fscache_n_relinquishes_waitcrt;
atomic_t fscache_n_relinquishes_retire;
-
-atomic_t fscache_n_cookie_index;
-atomic_t fscache_n_cookie_data;
-atomic_t fscache_n_cookie_special;
-
-atomic_t fscache_n_object_alloc;
-atomic_t fscache_n_object_no_alloc;
-atomic_t fscache_n_object_lookups;
-atomic_t fscache_n_object_lookups_negative;
-atomic_t fscache_n_object_lookups_positive;
-atomic_t fscache_n_object_lookups_timed_out;
-atomic_t fscache_n_object_created;
-atomic_t fscache_n_object_avail;
-atomic_t fscache_n_object_dead;
-
-atomic_t fscache_n_checkaux_none;
-atomic_t fscache_n_checkaux_okay;
-atomic_t fscache_n_checkaux_update;
-atomic_t fscache_n_checkaux_obsolete;
-
-atomic_t fscache_n_cop_alloc_object;
-atomic_t fscache_n_cop_lookup_object;
-atomic_t fscache_n_cop_lookup_complete;
-atomic_t fscache_n_cop_grab_object;
-atomic_t fscache_n_cop_invalidate_object;
-atomic_t fscache_n_cop_update_object;
-atomic_t fscache_n_cop_drop_object;
-atomic_t fscache_n_cop_put_object;
-atomic_t fscache_n_cop_sync_cache;
-atomic_t fscache_n_cop_attr_changed;
-atomic_t fscache_n_cop_read_or_alloc_page;
-atomic_t fscache_n_cop_read_or_alloc_pages;
-atomic_t fscache_n_cop_allocate_page;
-atomic_t fscache_n_cop_allocate_pages;
-atomic_t fscache_n_cop_write_page;
-atomic_t fscache_n_cop_uncache_page;
-atomic_t fscache_n_cop_dissociate_pages;
-
-atomic_t fscache_n_cache_no_space_reject;
-atomic_t fscache_n_cache_stale_objects;
-atomic_t fscache_n_cache_retired_objects;
-atomic_t fscache_n_cache_culled_objects;
+atomic_t fscache_n_relinquishes_dropped;
+
+atomic_t fscache_n_resizes;
+atomic_t fscache_n_resizes_null;
+
+atomic_t fscache_n_read;
+EXPORT_SYMBOL(fscache_n_read);
+atomic_t fscache_n_write;
+EXPORT_SYMBOL(fscache_n_write);
+atomic_t fscache_n_no_write_space;
+EXPORT_SYMBOL(fscache_n_no_write_space);
+atomic_t fscache_n_no_create_space;
+EXPORT_SYMBOL(fscache_n_no_create_space);
+atomic_t fscache_n_culled;
+EXPORT_SYMBOL(fscache_n_culled);
/*
* display the general statistics
@@ -137,147 +55,48 @@ atomic_t fscache_n_cache_culled_objects;
int fscache_stats_show(struct seq_file *m, void *v)
{
seq_puts(m, "FS-Cache statistics\n");
-
- seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
- atomic_read(&fscache_n_cookie_index),
- atomic_read(&fscache_n_cookie_data),
- atomic_read(&fscache_n_cookie_special));
-
- seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
- atomic_read(&fscache_n_object_alloc),
- atomic_read(&fscache_n_object_no_alloc),
- atomic_read(&fscache_n_object_avail),
- atomic_read(&fscache_n_object_dead));
- seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
- atomic_read(&fscache_n_checkaux_none),
- atomic_read(&fscache_n_checkaux_okay),
- atomic_read(&fscache_n_checkaux_update),
- atomic_read(&fscache_n_checkaux_obsolete));
-
- seq_printf(m, "Pages : mrk=%u unc=%u\n",
- atomic_read(&fscache_n_marks),
- atomic_read(&fscache_n_uncaches));
-
- seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
- " oom=%u\n",
+ seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n",
+ atomic_read(&fscache_n_cookies),
+ atomic_read(&fscache_n_volumes),
+ atomic_read(&fscache_n_volumes_collision),
+ atomic_read(&fscache_n_volumes_nomem)
+ );
+
+ seq_printf(m, "Acquire: n=%u ok=%u oom=%u\n",
atomic_read(&fscache_n_acquires),
- atomic_read(&fscache_n_acquires_null),
- atomic_read(&fscache_n_acquires_no_cache),
atomic_read(&fscache_n_acquires_ok),
- atomic_read(&fscache_n_acquires_nobufs),
atomic_read(&fscache_n_acquires_oom));
- seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n",
- atomic_read(&fscache_n_object_lookups),
- atomic_read(&fscache_n_object_lookups_negative),
- atomic_read(&fscache_n_object_lookups_positive),
- atomic_read(&fscache_n_object_created),
- atomic_read(&fscache_n_object_lookups_timed_out));
+ seq_printf(m, "LRU : n=%u exp=%u rmv=%u drp=%u at=%ld\n",
+ atomic_read(&fscache_n_cookies_lru),
+ atomic_read(&fscache_n_cookies_lru_expired),
+ atomic_read(&fscache_n_cookies_lru_removed),
+ atomic_read(&fscache_n_cookies_lru_dropped),
+ timer_pending(&fscache_cookie_lru_timer) ?
+ fscache_cookie_lru_timer.expires - jiffies : 0);
- seq_printf(m, "Invals : n=%u run=%u\n",
- atomic_read(&fscache_n_invalidates),
- atomic_read(&fscache_n_invalidates_run));
+ seq_printf(m, "Invals : n=%u\n",
+ atomic_read(&fscache_n_invalidates));
- seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
+ seq_printf(m, "Updates: n=%u rsz=%u rsn=%u\n",
atomic_read(&fscache_n_updates),
- atomic_read(&fscache_n_updates_null),
- atomic_read(&fscache_n_updates_run));
+ atomic_read(&fscache_n_resizes),
+ atomic_read(&fscache_n_resizes_null));
- seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n",
+ seq_printf(m, "Relinqs: n=%u rtr=%u drop=%u\n",
atomic_read(&fscache_n_relinquishes),
- atomic_read(&fscache_n_relinquishes_null),
- atomic_read(&fscache_n_relinquishes_waitcrt),
- atomic_read(&fscache_n_relinquishes_retire));
-
- seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
- atomic_read(&fscache_n_attr_changed),
- atomic_read(&fscache_n_attr_changed_ok),
- atomic_read(&fscache_n_attr_changed_nobufs),
- atomic_read(&fscache_n_attr_changed_nomem),
- atomic_read(&fscache_n_attr_changed_calls));
-
- seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n",
- atomic_read(&fscache_n_allocs),
- atomic_read(&fscache_n_allocs_ok),
- atomic_read(&fscache_n_allocs_wait),
- atomic_read(&fscache_n_allocs_nobufs),
- atomic_read(&fscache_n_allocs_intr));
- seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n",
- atomic_read(&fscache_n_alloc_ops),
- atomic_read(&fscache_n_alloc_op_waits),
- atomic_read(&fscache_n_allocs_object_dead));
-
- seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
- " int=%u oom=%u\n",
- atomic_read(&fscache_n_retrievals),
- atomic_read(&fscache_n_retrievals_ok),
- atomic_read(&fscache_n_retrievals_wait),
- atomic_read(&fscache_n_retrievals_nodata),
- atomic_read(&fscache_n_retrievals_nobufs),
- atomic_read(&fscache_n_retrievals_intr),
- atomic_read(&fscache_n_retrievals_nomem));
- seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n",
- atomic_read(&fscache_n_retrieval_ops),
- atomic_read(&fscache_n_retrieval_op_waits),
- atomic_read(&fscache_n_retrievals_object_dead));
-
- seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
- atomic_read(&fscache_n_stores),
- atomic_read(&fscache_n_stores_ok),
- atomic_read(&fscache_n_stores_again),
- atomic_read(&fscache_n_stores_nobufs),
- atomic_read(&fscache_n_stores_oom));
- seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n",
- atomic_read(&fscache_n_store_ops),
- atomic_read(&fscache_n_store_calls),
- atomic_read(&fscache_n_store_pages),
- atomic_read(&fscache_n_store_radix_deletes),
- atomic_read(&fscache_n_store_pages_over_limit));
+ atomic_read(&fscache_n_relinquishes_retire),
+ atomic_read(&fscache_n_relinquishes_dropped));
- seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n",
- atomic_read(&fscache_n_store_vmscan_not_storing),
- atomic_read(&fscache_n_store_vmscan_gone),
- atomic_read(&fscache_n_store_vmscan_busy),
- atomic_read(&fscache_n_store_vmscan_cancelled),
- atomic_read(&fscache_n_store_vmscan_wait));
+ seq_printf(m, "NoSpace: nwr=%u ncr=%u cull=%u\n",
+ atomic_read(&fscache_n_no_write_space),
+ atomic_read(&fscache_n_no_create_space),
+ atomic_read(&fscache_n_culled));
- seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n",
- atomic_read(&fscache_n_op_pend),
- atomic_read(&fscache_n_op_run),
- atomic_read(&fscache_n_op_enqueue),
- atomic_read(&fscache_n_op_cancelled),
- atomic_read(&fscache_n_op_rejected));
- seq_printf(m, "Ops : ini=%u dfr=%u rel=%u gc=%u\n",
- atomic_read(&fscache_n_op_initialised),
- atomic_read(&fscache_n_op_deferred_release),
- atomic_read(&fscache_n_op_release),
- atomic_read(&fscache_n_op_gc));
+ seq_printf(m, "IO : rd=%u wr=%u\n",
+ atomic_read(&fscache_n_read),
+ atomic_read(&fscache_n_write));
- seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n",
- atomic_read(&fscache_n_cop_alloc_object),
- atomic_read(&fscache_n_cop_lookup_object),
- atomic_read(&fscache_n_cop_lookup_complete),
- atomic_read(&fscache_n_cop_grab_object));
- seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n",
- atomic_read(&fscache_n_cop_invalidate_object),
- atomic_read(&fscache_n_cop_update_object),
- atomic_read(&fscache_n_cop_drop_object),
- atomic_read(&fscache_n_cop_put_object),
- atomic_read(&fscache_n_cop_attr_changed),
- atomic_read(&fscache_n_cop_sync_cache));
- seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n",
- atomic_read(&fscache_n_cop_read_or_alloc_page),
- atomic_read(&fscache_n_cop_read_or_alloc_pages),
- atomic_read(&fscache_n_cop_allocate_page),
- atomic_read(&fscache_n_cop_allocate_pages),
- atomic_read(&fscache_n_cop_write_page),
- atomic_read(&fscache_n_cop_uncache_page),
- atomic_read(&fscache_n_cop_dissociate_pages));
- seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n",
- atomic_read(&fscache_n_cache_no_space_reject),
- atomic_read(&fscache_n_cache_stale_objects),
- atomic_read(&fscache_n_cache_retired_objects),
- atomic_read(&fscache_n_cache_culled_objects));
netfs_stats_show(m);
return 0;
}
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c
new file mode 100644
index 000000000000..f2aa7dbad766
--- /dev/null
+++ b/fs/fscache/volume.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Volume-level cache cookie handling.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/export.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+#define fscache_volume_hash_shift 10
+static struct hlist_bl_head fscache_volume_hash[1 << fscache_volume_hash_shift];
+static atomic_t fscache_volume_debug_id;
+static LIST_HEAD(fscache_volumes);
+
+static void fscache_create_volume_work(struct work_struct *work);
+
+struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where)
+{
+ int ref;
+
+ __refcount_inc(&volume->ref, &ref);
+ trace_fscache_volume(volume->debug_id, ref + 1, where);
+ return volume;
+}
+
+static void fscache_see_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where)
+{
+ int ref = refcount_read(&volume->ref);
+
+ trace_fscache_volume(volume->debug_id, ref, where);
+}
+
+/*
+ * Pin the cache behind a volume so that we can access it.
+ */
+static void __fscache_begin_volume_access(struct fscache_volume *volume,
+ struct fscache_cookie *cookie,
+ enum fscache_access_trace why)
+{
+ int n_accesses;
+
+ n_accesses = atomic_inc_return(&volume->n_accesses);
+ smp_mb__after_atomic();
+ trace_fscache_access_volume(volume->debug_id, cookie ? cookie->debug_id : 0,
+ refcount_read(&volume->ref),
+ n_accesses, why);
+}
+
+/**
+ * fscache_begin_volume_access - Pin a cache so a volume can be accessed
+ * @volume: The volume cookie
+ * @cookie: A datafile cookie for a tracing reference (or NULL)
+ * @why: An indication of the circumstances of the access for tracing
+ *
+ * Attempt to pin the cache to prevent it from going away whilst we're
+ * accessing a volume and returns true if successful. This works as follows:
+ *
+ * (1) If the cache tests as not live (state is not FSCACHE_CACHE_IS_ACTIVE),
+ * then we return false to indicate access was not permitted.
+ *
+ * (2) If the cache tests as live, then we increment the volume's n_accesses
+ * count and then recheck the cache liveness, ending the access if it
+ * ceased to be live.
+ *
+ * (3) When we end the access, we decrement the volume's n_accesses and wake
+ * up the any waiters if it reaches 0.
+ *
+ * (4) Whilst the cache is caching, the volume's n_accesses is kept
+ * artificially incremented to prevent wakeups from happening.
+ *
+ * (5) When the cache is taken offline, the state is changed to prevent new
+ * accesses, the volume's n_accesses is decremented and we wait for it to
+ * become 0.
+ *
+ * The datafile @cookie and the @why indicator are merely provided for tracing
+ * purposes.
+ */
+bool fscache_begin_volume_access(struct fscache_volume *volume,
+ struct fscache_cookie *cookie,
+ enum fscache_access_trace why)
+{
+ if (!fscache_cache_is_live(volume->cache))
+ return false;
+ __fscache_begin_volume_access(volume, cookie, why);
+ if (!fscache_cache_is_live(volume->cache)) {
+ fscache_end_volume_access(volume, cookie, fscache_access_unlive);
+ return false;
+ }
+ return true;
+}
+
+/**
+ * fscache_end_volume_access - Unpin a cache at the end of an access.
+ * @volume: The volume cookie
+ * @cookie: A datafile cookie for a tracing reference (or NULL)
+ * @why: An indication of the circumstances of the access for tracing
+ *
+ * Unpin a cache volume after we've accessed it. The datafile @cookie and the
+ * @why indicator are merely provided for tracing purposes.
+ */
+void fscache_end_volume_access(struct fscache_volume *volume,
+ struct fscache_cookie *cookie,
+ enum fscache_access_trace why)
+{
+ int n_accesses;
+
+ smp_mb__before_atomic();
+ n_accesses = atomic_dec_return(&volume->n_accesses);
+ trace_fscache_access_volume(volume->debug_id, cookie ? cookie->debug_id : 0,
+ refcount_read(&volume->ref),
+ n_accesses, why);
+ if (n_accesses == 0)
+ wake_up_var(&volume->n_accesses);
+}
+EXPORT_SYMBOL(fscache_end_volume_access);
+
+static bool fscache_volume_same(const struct fscache_volume *a,
+ const struct fscache_volume *b)
+{
+ size_t klen;
+
+ if (a->key_hash != b->key_hash ||
+ a->cache != b->cache ||
+ a->key[0] != b->key[0])
+ return false;
+
+ klen = round_up(a->key[0] + 1, sizeof(__le32));
+ return memcmp(a->key, b->key, klen) == 0;
+}
+
+static bool fscache_is_acquire_pending(struct fscache_volume *volume)
+{
+ return test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &volume->flags);
+}
+
+static void fscache_wait_on_volume_collision(struct fscache_volume *candidate,
+ unsigned int collidee_debug_id)
+{
+ wait_var_event_timeout(&candidate->flags,
+ !fscache_is_acquire_pending(candidate), 20 * HZ);
+ if (!fscache_is_acquire_pending(candidate)) {
+ pr_notice("Potential volume collision new=%08x old=%08x",
+ candidate->debug_id, collidee_debug_id);
+ fscache_stat(&fscache_n_volumes_collision);
+ wait_var_event(&candidate->flags, !fscache_is_acquire_pending(candidate));
+ }
+}
+
+/*
+ * Attempt to insert the new volume into the hash. If there's a collision, we
+ * wait for the old volume to complete if it's being relinquished and an error
+ * otherwise.
+ */
+static bool fscache_hash_volume(struct fscache_volume *candidate)
+{
+ struct fscache_volume *cursor;
+ struct hlist_bl_head *h;
+ struct hlist_bl_node *p;
+ unsigned int bucket, collidee_debug_id = 0;
+
+ bucket = candidate->key_hash & (ARRAY_SIZE(fscache_volume_hash) - 1);
+ h = &fscache_volume_hash[bucket];
+
+ hlist_bl_lock(h);
+ hlist_bl_for_each_entry(cursor, p, h, hash_link) {
+ if (fscache_volume_same(candidate, cursor)) {
+ if (!test_bit(FSCACHE_VOLUME_RELINQUISHED, &cursor->flags))
+ goto collision;
+ fscache_see_volume(cursor, fscache_volume_get_hash_collision);
+ set_bit(FSCACHE_VOLUME_COLLIDED_WITH, &cursor->flags);
+ set_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags);
+ collidee_debug_id = cursor->debug_id;
+ break;
+ }
+ }
+
+ hlist_bl_add_head(&candidate->hash_link, h);
+ hlist_bl_unlock(h);
+
+ if (test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags))
+ fscache_wait_on_volume_collision(candidate, collidee_debug_id);
+ return true;
+
+collision:
+ fscache_see_volume(cursor, fscache_volume_collision);
+ hlist_bl_unlock(h);
+ return false;
+}
+
+/*
+ * Allocate and initialise a volume representation cookie.
+ *
+ * @volume_key is copied into a freshly allocated, zero-padded buffer whose
+ * first byte holds the key length; that buffer is hashed and attached to the
+ * volume.  @coherency_data, if given, is copied into the volume's trailing
+ * coherency[] array (@coherency_len is ignored when no data is supplied).
+ * The cache looked up by @cache_name is recorded in the volume and released
+ * again on the error paths.
+ *
+ * The new volume is added to the fscache_volumes list under
+ * fscache_addremove_sem and is returned with a refcount of 1.
+ *
+ * Returns the volume, or NULL if the key is too long for its one-byte length
+ * prefix, the cache lookup fails or an allocation fails.
+ */
+static struct fscache_volume *fscache_alloc_volume(const char *volume_key,
+						   const char *cache_name,
+						   const void *coherency_data,
+						   size_t coherency_len)
+{
+	struct fscache_volume *volume;
+	struct fscache_cache *cache;
+	size_t klen, hlen;
+	char *key;
+
+	/* The key length is stored in key[0], a single byte, so a longer key
+	 * would have its length silently truncated there.  Refuse such keys
+	 * before anything has been allocated or looked up.
+	 */
+	klen = strlen(volume_key);
+	if (klen > NAME_MAX)
+		return NULL;
+
+	if (!coherency_data)
+		coherency_len = 0;
+
+	cache = fscache_lookup_cache(cache_name, false);
+	if (IS_ERR(cache))
+		return NULL;
+
+	volume = kzalloc(struct_size(volume, coherency, coherency_len),
+			 GFP_KERNEL);
+	if (!volume)
+		goto err_cache;
+
+	volume->cache = cache;
+	volume->coherency_len = coherency_len;
+	if (coherency_data)
+		memcpy(volume->coherency, coherency_data, coherency_len);
+	INIT_LIST_HEAD(&volume->proc_link);
+	INIT_WORK(&volume->work, fscache_create_volume_work);
+	refcount_set(&volume->ref, 1);
+	spin_lock_init(&volume->lock);
+
+	/* Stick the length on the front of the key and pad it out to make
+	 * hashing easier.
+	 */
+	hlen = round_up(1 + klen + 1, sizeof(__le32));
+	key = kzalloc(hlen, GFP_KERNEL);
+	if (!key)
+		goto err_vol;
+	key[0] = klen;
+	memcpy(key + 1, volume_key, klen);
+
+	volume->key = key;
+	volume->key_hash = fscache_hash(0, key, hlen);
+
+	volume->debug_id = atomic_inc_return(&fscache_volume_debug_id);
+	down_write(&fscache_addremove_sem);
+	atomic_inc(&cache->n_volumes);
+	list_add_tail(&volume->proc_link, &fscache_volumes);
+	fscache_see_volume(volume, fscache_volume_new_acquire);
+	fscache_stat(&fscache_n_volumes);
+	up_write(&fscache_addremove_sem);
+	_leave(" = v=%x", volume->debug_id);
+	return volume;
+
+err_vol:
+	kfree(volume);
+err_cache:
+	fscache_put_cache(cache, fscache_cache_put_alloc_volume);
+	fscache_stat(&fscache_n_volumes_nomem);
+	return NULL;
+}
+
+/*
+ * Create a volume's representation on disk.  Have a volume ref and a cache
+ * access we have to release.
+ *
+ * This is the handler for volume->work.  It calls the cache backend's
+ * ->acquire_volume() op if there is one, ends the cache access, clears
+ * FSCACHE_VOLUME_CREATING (waking anyone waiting on that bit) and finally
+ * drops the volume ref.
+ */
+static void fscache_create_volume_work(struct work_struct *work)
+{
+	const struct fscache_cache_ops *ops;
+	struct fscache_volume *volume =
+		container_of(work, struct fscache_volume, work);
+
+	fscache_see_volume(volume, fscache_volume_see_create_work);
+
+	ops = volume->cache->ops;
+	if (ops->acquire_volume)
+		ops->acquire_volume(volume);
+	fscache_end_cache_access(volume->cache,
+				 fscache_access_acquire_volume_end);
+
+	/* wake_up_bit() needs a memory barrier between the bit being cleared
+	 * and its check for waiters, otherwise a sleeper can miss the wakeup.
+	 * clear_and_wake_up_bit() supplies that barrier.
+	 */
+	clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags);
+	fscache_put_volume(volume, fscache_volume_put_create_work);
+}
+
+/*
+ * Dispatch a worker thread to create a volume's representation on disk.
+ *
+ * Only the caller that manages to set FSCACHE_VOLUME_CREATING goes on to
+ * queue volume->work; it takes a cache access and a volume ref for the work
+ * item first.  If the volume already has cache_priv set, or cache access
+ * cannot be begun, the bit is cleared again and any waiters are woken.
+ *
+ * @volume: The volume to create.
+ * @wait: If true, sleep uninterruptibly until FSCACHE_VOLUME_CREATING is
+ *	  cleared (not done on the no_wait bail-out paths).
+ */
+void fscache_create_volume(struct fscache_volume *volume, bool wait)
+{
+	if (test_and_set_bit(FSCACHE_VOLUME_CREATING, &volume->flags))
+		goto maybe_wait;
+	if (volume->cache_priv)
+		goto no_wait; /* We raced */
+	if (!fscache_begin_cache_access(volume->cache,
+					fscache_access_acquire_volume))
+		goto no_wait;
+
+	fscache_get_volume(volume, fscache_volume_get_create_work);
+	if (!schedule_work(&volume->work))
+		fscache_put_volume(volume, fscache_volume_put_create_work);
+
+maybe_wait:
+	if (wait) {
+		fscache_see_volume(volume, fscache_volume_wait_create_work);
+		wait_on_bit(&volume->flags, FSCACHE_VOLUME_CREATING,
+			    TASK_UNINTERRUPTIBLE);
+	}
+	return;
+no_wait:
+	/* Use the combined helper so that the memory barrier wake_up_bit()
+	 * requires after the bit is cleared is actually present; a concurrent
+	 * caller may already be sleeping in the wait_on_bit() above.
+	 */
+	clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags);
+}
+
+/*
+ * Acquire a volume representation cookie and link it to a (proposed) cache.
+ *
+ * Allocates the volume, inserts it into the volume hash table and kicks off
+ * creation of its on-disk representation without waiting for that to finish.
+ *
+ * Returns the volume on success, ERR_PTR(-ENOMEM) if the volume could not be
+ * allocated, or ERR_PTR(-EBUSY) if hashing reported a collision.
+ */
+struct fscache_volume *__fscache_acquire_volume(const char *volume_key,
+						const char *cache_name,
+						const void *coherency_data,
+						size_t coherency_len)
+{
+	struct fscache_volume *vol = fscache_alloc_volume(volume_key,
+							  cache_name,
+							  coherency_data,
+							  coherency_len);
+
+	if (!vol)
+		return ERR_PTR(-ENOMEM);
+
+	if (fscache_hash_volume(vol)) {
+		fscache_create_volume(vol, false);
+		return vol;
+	}
+
+	/* Lost to an existing volume with the same key: discard ours. */
+	fscache_put_volume(vol, fscache_volume_put_hash_collision);
+	return ERR_PTR(-EBUSY);
+}
+EXPORT_SYMBOL(__fscache_acquire_volume);
+
+/*
+ * Wake the first volume on hash chain @h that matches @volume (as judged by
+ * fscache_volume_same()) by clearing its FSCACHE_VOLUME_ACQUIRE_PENDING flag.
+ * The only caller visible here, fscache_unhash_volume(), invokes this with
+ * the chain locked and after @volume itself has been unlinked from it.
+ */
+static void fscache_wake_pending_volume(struct fscache_volume *volume,
+					struct hlist_bl_head *h)
+{
+	struct fscache_volume *cursor;
+	struct hlist_bl_node *p;
+
+	hlist_bl_for_each_entry(cursor, p, h, hash_link) {
+		if (fscache_volume_same(cursor, volume)) {
+			fscache_see_volume(cursor, fscache_volume_see_hash_wake);
+			/* A plain clear_bit() followed by wake_up_bit() lacks
+			 * the barrier wake_up_bit() needs before it looks for
+			 * waiters, so the wakeup could be missed.  The
+			 * combined helper orders the two correctly.
+			 */
+			clear_and_wake_up_bit(FSCACHE_VOLUME_ACQUIRE_PENDING,
+					      &cursor->flags);
+			return;
+		}
+	}
+}
+
+/*
+ * Remove a volume cookie from the hash table.
+ *
+ * The bucket is picked from the low bits of the volume's key hash.  With the
+ * bucket locked, the volume is unlinked and, if it was flagged as having been
+ * collided with, the matching pending volume on the same chain is woken.
+ */
+static void fscache_unhash_volume(struct fscache_volume *volume)
+{
+	unsigned int slot =
+		volume->key_hash & (ARRAY_SIZE(fscache_volume_hash) - 1);
+	struct hlist_bl_head *chain = &fscache_volume_hash[slot];
+
+	hlist_bl_lock(chain);
+	hlist_bl_del(&volume->hash_link);
+	if (test_bit(FSCACHE_VOLUME_COLLIDED_WITH, &volume->flags))
+		fscache_wake_pending_volume(volume, chain);
+	hlist_bl_unlock(chain);
+}
+
+/*
+ * Drop a cache's volume attachments.
+ *
+ * Tears a volume down in order: has the cache backend free its private state
+ * (if any), takes the volume off the fscache_volumes list, unhashes it, frees
+ * the key and the volume itself, and finally drops the cache recorded in the
+ * volume.
+ */
+static void fscache_free_volume(struct fscache_volume *volume)
+{
+	struct fscache_cache *cache = volume->cache;
+
+	if (volume->cache_priv) {
+		__fscache_begin_volume_access(volume, NULL,
+					      fscache_access_relinquish_volume);
+		/* Checked again now that volume access has been begun. */
+		if (volume->cache_priv)
+			cache->ops->free_volume(volume);
+		fscache_end_volume_access(volume, NULL,
+					  fscache_access_relinquish_volume_end);
+	}
+
+	down_write(&fscache_addremove_sem);
+	list_del_init(&volume->proc_link);
+	atomic_dec(&cache->n_volumes);
+	up_write(&fscache_addremove_sem);
+
+	if (!hlist_bl_unhashed(&volume->hash_link))
+		fscache_unhash_volume(volume);
+
+	trace_fscache_volume(volume->debug_id, 0, fscache_volume_free);
+	kfree(volume->key);
+	kfree(volume);
+	fscache_stat_d(&fscache_n_volumes);
+	/* The volume is gone; use the pointer saved at the top. */
+	fscache_put_cache(cache, fscache_cache_put_volume);
+}
+
+/*
+ * Drop a reference to a volume cookie.
+ *
+ * A NULL @volume is ignored.  The drop is traced with the reference count as
+ * it stands after the decrement, and the volume is freed when that count
+ * reaches zero.
+ */
+void fscache_put_volume(struct fscache_volume *volume,
+			enum fscache_volume_trace where)
+{
+	unsigned int debug_id;
+	bool last;
+	int old;
+
+	if (!volume)
+		return;
+
+	/* The ID is copied out before the decrement and the copy is what gets
+	 * traced - presumably because the volume must not be touched once our
+	 * ref is gone unless we turn out to be the one freeing it.
+	 */
+	debug_id = volume->debug_id;
+	last = __refcount_dec_and_test(&volume->ref, &old);
+	trace_fscache_volume(debug_id, old - 1, where);
+	if (last)
+		fscache_free_volume(volume);
+}
+
+/*
+ * Relinquish a volume representation cookie.
+ *
+ * Marks the volume relinquished (warning and doing nothing further if it
+ * already was), then either flags it for invalidation or, when not
+ * invalidating and @coherency_data is supplied, copies volume->coherency_len
+ * bytes of it into the volume.  The caller's ref is dropped at the end.
+ */
+void __fscache_relinquish_volume(struct fscache_volume *volume,
+				 const void *coherency_data,
+				 bool invalidate)
+{
+	if (WARN_ON(test_and_set_bit(FSCACHE_VOLUME_RELINQUISHED, &volume->flags)))
+		return;
+
+	if (!invalidate) {
+		if (coherency_data)
+			memcpy(volume->coherency, coherency_data,
+			       volume->coherency_len);
+	} else {
+		set_bit(FSCACHE_VOLUME_INVALIDATE, &volume->flags);
+	}
+
+	fscache_put_volume(volume, fscache_volume_put_relinquish);
+}
+EXPORT_SYMBOL(__fscache_relinquish_volume);
+
+/**
+ * fscache_withdraw_volume - Withdraw a volume from being cached
+ * @volume: Volume cookie
+ *
+ * Withdraw a cache volume from service, waiting for all accesses to complete
+ * before returning.
+ *
+ * One count is dropped from @volume->n_accesses up front; the function then
+ * sleeps until that counter reads zero.
+ */
+void fscache_withdraw_volume(struct fscache_volume *volume)
+{
+	int remaining;
+
+	_debug("withdraw V=%x", volume->debug_id);
+
+	/* Allow wakeups on dec-to-0 */
+	remaining = atomic_dec_return(&volume->n_accesses);
+	trace_fscache_access_volume(volume->debug_id, 0,
+				    refcount_read(&volume->ref),
+				    remaining, fscache_access_cache_unpin);
+
+	wait_var_event(&volume->n_accesses,
+		       atomic_read(&volume->n_accesses) == 0);
+}
+EXPORT_SYMBOL(fscache_withdraw_volume);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Generate a list of volumes in /proc/fs/fscache/volumes
+ *
+ * Emits the two-line column header when handed the list head itself, and
+ * otherwise one line describing the volume that @v is the proc_link of.
+ * Always returns 0.
+ */
+static int fscache_volumes_seq_show(struct seq_file *m, void *v)
+{
+	struct fscache_volume *vol;
+
+	if (v != &fscache_volumes) {
+		vol = list_entry(v, struct fscache_volume, proc_link);
+		/* key[0] holds the key length, so the text starts at key + 1 */
+		seq_printf(m,
+			   "%08x %5d %5d %3d %02lx %-15.15s %s\n",
+			   vol->debug_id,
+			   refcount_read(&vol->ref),
+			   atomic_read(&vol->n_cookies),
+			   atomic_read(&vol->n_accesses),
+			   vol->flags,
+			   vol->cache->name ?: "-",
+			   vol->key + 1);
+		return 0;
+	}
+
+	seq_puts(m,
+		 "VOLUME REF nCOOK ACC FL CACHE KEY\n"
+		 "======== ===== ===== === == =============== ================\n");
+	return 0;
+}
+
+/*
+ * Start a pass over the volume list: take fscache_addremove_sem for reading
+ * and return the list position for *_pos via seq_list_start_head().  The
+ * semaphore stays held until fscache_volumes_seq_stop() releases it.
+ */
+static void *fscache_volumes_seq_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&fscache_addremove_sem)
+{
+ down_read(&fscache_addremove_sem);
+ return seq_list_start_head(&fscache_volumes, *_pos);
+}
+
+/*
+ * Step from @v to the next entry on the fscache_volumes list, advancing *_pos
+ * by way of seq_list_next().
+ */
+static void *fscache_volumes_seq_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+ return seq_list_next(v, &fscache_volumes, _pos);
+}
+
+/*
+ * Finish a pass over the volume list by releasing the read hold on
+ * fscache_addremove_sem.
+ */
+static void fscache_volumes_seq_stop(struct seq_file *m, void *v)
+ __releases(&fscache_addremove_sem)
+{
+ up_read(&fscache_addremove_sem);
+}
+
+/*
+ * seq_file iterator operations that walk the fscache_volumes list and print
+ * one line per volume.
+ */
+const struct seq_operations fscache_volumes_seq_ops = {
+ .start = fscache_volumes_seq_start,
+ .next = fscache_volumes_seq_next,
+ .stop = fscache_volumes_seq_stop,
+ .show = fscache_volumes_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 40ce9a1c12e5..038ed0b9aaa5 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -45,7 +45,7 @@ config FUSE_DAX
select INTERVAL_TREE
depends on VIRTIO_FS
depends on FS_DAX
- depends on DAX_DRIVER
+ depends on DAX
help
This allows bypassing guest page cache and allows mapping host page
cache directly in guest address space.
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 000d2e5627e9..7cede9a3bc96 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
{
unsigned val;
struct fuse_conn *fc;
- struct fuse_mount *fm;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;
-
- /*
- * Get any fuse_mount belonging to this fuse_conn; s_bdi is
- * shared between all of them
- */
-
- if (!list_empty(&fc->mounts)) {
- fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
- if (fc->num_background < fc->congestion_threshold) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- } else {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
- }
spin_unlock(&fc->bg_lock);
up_read(&fc->killsb);
fuse_conn_put(fc);
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 281d79f8b3d3..d7d3a7f06862 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -732,11 +732,8 @@ static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
- if (ret < 0)
- return ret;
- fuse_invalidate_attr(inode);
- fuse_write_update_size(inode, iocb->ki_pos);
+ fuse_write_update_attr(inode, iocb->ki_pos, ret);
return ret;
}
@@ -1282,11 +1279,14 @@ out_err:
return ret;
}
-int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
+int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode,
+ struct dax_device *dax_dev)
{
struct fuse_conn_dax *fcd;
int err;
+ fc->dax_mode = dax_mode;
+
if (!dax_dev)
return 0;
@@ -1326,21 +1326,49 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
static const struct address_space_operations fuse_dax_file_aops = {
.writepages = fuse_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
};
-void fuse_dax_inode_init(struct inode *inode)
+static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags)
{
struct fuse_conn *fc = get_fuse_conn(inode);
+ enum fuse_dax_mode dax_mode = fc->dax_mode;
+
+ if (dax_mode == FUSE_DAX_NEVER)
+ return false;
+ /*
+ * fc->dax may be NULL in 'inode' mode when filesystem device doesn't
+ * support DAX, in which case it will silently fallback to 'never' mode.
+ */
if (!fc->dax)
+ return false;
+
+ if (dax_mode == FUSE_DAX_ALWAYS)
+ return true;
+
+ /* dax_mode is FUSE_DAX_INODE* */
+ return fc->inode_dax && (flags & FUSE_ATTR_DAX);
+}
+
+void fuse_dax_inode_init(struct inode *inode, unsigned int flags)
+{
+ if (!fuse_should_enable_dax(inode, flags))
return;
inode->i_flags |= S_DAX;
inode->i_data.a_ops = &fuse_dax_file_aops;
}
+void fuse_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (fuse_is_inode_dax_mode(fc->dax_mode) &&
+ ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX)))
+ d_mark_dontcache(inode);
+}
+
bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
{
if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index dde341a6388a..0e537e580dc1 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req)
wake_up(&fc->blocked_waitq);
}
- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);
@@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req *req)
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
queued = true;
@@ -756,7 +748,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
{
unsigned ncpy = min(*size, cs->len);
if (val) {
- void *pgaddr = kmap_atomic(cs->pg);
+ void *pgaddr = kmap_local_page(cs->pg);
void *buf = pgaddr + cs->offset;
if (cs->write)
@@ -764,7 +756,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
else
memcpy(*val, buf, ncpy);
- kunmap_atomic(pgaddr);
+ kunmap_local(pgaddr);
*val += ncpy;
}
*size -= ncpy;
@@ -852,6 +844,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
lru_cache_add(newpage);
+ /*
+ * Release while we have extra ref on stolen page. Otherwise
+ * anon_pipe_buf_release() might think the page can be reused.
+ */
+ pipe_buf_release(cs->pipe, buf);
+
err = 0;
spin_lock(&cs->req->waitq.lock);
if (test_bit(FR_ABORTED, &cs->req->flags))
@@ -935,7 +933,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
while (count) {
if (cs->write && cs->pipebufs && page) {
- return fuse_ref_page(cs, page, offset, count);
+ /*
+ * Can't control lifetime of pipe buffers, so always
+ * copy user pages.
+ */
+ if (cs->req->args->user_pages) {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ } else {
+ return fuse_ref_page(cs, page, offset, count);
+ }
} else if (!cs->len) {
if (cs->move_pages && page &&
offset == 0 && count == PAGE_SIZE) {
@@ -949,10 +957,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
}
}
if (page) {
- void *mapaddr = kmap_atomic(page);
+ void *mapaddr = kmap_local_page(page);
void *buf = mapaddr + offset;
offset += fuse_copy_do(cs, &buf, &count);
- kunmap_atomic(mapaddr);
+ kunmap_local(mapaddr);
} else
offset += fuse_copy_do(cs, NULL, &count);
}
@@ -1591,7 +1599,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
end = outarg.offset + outarg.size;
if (end > file_size) {
file_size = end;
- fuse_write_update_size(inode, file_size);
+ fuse_write_update_attr(inode, file_size, outarg.size);
}
num = outarg.size;
@@ -2031,8 +2039,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe_lock(pipe);
out_free:
- for (idx = 0; idx < nbuf; idx++)
- pipe_buf_release(pipe, &bufs[idx]);
+ for (idx = 0; idx < nbuf; idx++) {
+ struct pipe_buffer *buf = &bufs[idx];
+
+ if (buf->ops)
+ pipe_buf_release(pipe, buf);
+ }
pipe_unlock(pipe);
kvfree(bufs);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d9b977c0f38d..9ff27b8a9782 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -17,6 +17,9 @@
#include <linux/xattr.h>
#include <linux/iversion.h>
#include <linux/posix_acl.h>
+#include <linux/security.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
static void fuse_advise_use_readdirplus(struct inode *dir)
{
@@ -116,7 +119,7 @@ u64 entry_attr_timeout(struct fuse_entry_out *o)
return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
}
-static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
{
set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask);
}
@@ -456,6 +459,62 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
return ERR_PTR(err);
}
+static int get_security_context(struct dentry *entry, umode_t mode,
+ void **security_ctx, u32 *security_ctxlen)
+{
+ struct fuse_secctx *fctx;
+ struct fuse_secctx_header *header;
+ void *ctx = NULL, *ptr;
+ u32 ctxlen, total_len = sizeof(*header);
+ int err, nr_ctx = 0;
+ const char *name;
+ size_t namelen;
+
+ err = security_dentry_init_security(entry, mode, &entry->d_name,
+ &name, &ctx, &ctxlen);
+ if (err) {
+ if (err != -EOPNOTSUPP)
+ goto out_err;
+ /* No LSM is supporting this security hook. Ignore error */
+ ctxlen = 0;
+ ctx = NULL;
+ }
+
+ if (ctxlen) {
+ nr_ctx = 1;
+ namelen = strlen(name) + 1;
+ err = -EIO;
+ if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX))
+ goto out_err;
+ total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen);
+ }
+
+ err = -ENOMEM;
+ header = ptr = kzalloc(total_len, GFP_KERNEL);
+ if (!ptr)
+ goto out_err;
+
+ header->nr_secctx = nr_ctx;
+ header->size = total_len;
+ ptr += sizeof(*header);
+ if (nr_ctx) {
+ fctx = ptr;
+ fctx->size = ctxlen;
+ ptr += sizeof(*fctx);
+
+ strcpy(ptr, name);
+ ptr += namelen;
+
+ memcpy(ptr, ctx, ctxlen);
+ }
+ *security_ctxlen = total_len;
+ *security_ctx = header;
+ err = 0;
+out_err:
+ kfree(ctx);
+ return err;
+}
+
/*
* Atomic create+open operation
*
@@ -476,6 +535,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_entry_out outentry;
struct fuse_inode *fi;
struct fuse_file *ff;
+ void *security_ctx = NULL;
+ u32 security_ctxlen;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
@@ -517,7 +578,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
args.out_args[0].value = &outentry;
args.out_args[1].size = sizeof(outopen);
args.out_args[1].value = &outopen;
+
+ if (fm->fc->init_security) {
+ err = get_security_context(entry, mode, &security_ctx,
+ &security_ctxlen);
+ if (err)
+ goto out_put_forget_req;
+
+ args.in_numargs = 3;
+ args.in_args[2].size = security_ctxlen;
+ args.in_args[2].value = security_ctx;
+ }
+
err = fuse_simple_request(fm, &args);
+ kfree(security_ctx);
if (err)
goto out_free_ff;
@@ -620,6 +694,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
struct dentry *d;
int err;
struct fuse_forget_link *forget;
+ void *security_ctx = NULL;
+ u32 security_ctxlen;
if (fuse_is_bad(dir))
return -EIO;
@@ -633,7 +709,22 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
args->out_numargs = 1;
args->out_args[0].size = sizeof(outarg);
args->out_args[0].value = &outarg;
+
+ if (fm->fc->init_security && args->opcode != FUSE_LINK) {
+ err = get_security_context(entry, mode, &security_ctx,
+ &security_ctxlen);
+ if (err)
+ goto out_put_forget_req;
+
+ BUG_ON(args->in_numargs != 2);
+
+ args->in_numargs = 3;
+ args->in_args[2].size = security_ctxlen;
+ args->in_args[2].value = security_ctx;
+ }
+
err = fuse_simple_request(fm, args);
+ kfree(security_ctx);
if (err)
goto out_put_forget_req;
@@ -738,14 +829,51 @@ static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir,
return create_new_entry(fm, &args, dir, entry, S_IFLNK);
}
-void fuse_update_ctime(struct inode *inode)
+void fuse_flush_time_update(struct inode *inode)
+{
+ int err = sync_inode_metadata(inode, 1);
+
+ mapping_set_error(inode->i_mapping, err);
+}
+
+static void fuse_update_ctime_in_cache(struct inode *inode)
{
if (!IS_NOCMTIME(inode)) {
inode->i_ctime = current_time(inode);
mark_inode_dirty_sync(inode);
+ fuse_flush_time_update(inode);
}
}
+void fuse_update_ctime(struct inode *inode)
+{
+ fuse_invalidate_attr_mask(inode, STATX_CTIME);
+ fuse_update_ctime_in_cache(inode);
+}
+
+static void fuse_entry_unlinked(struct dentry *entry)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ /*
+ * If i_nlink == 0 then unlink doesn't make sense, yet this can
+ * happen if userspace filesystem is careless. It would be
+ * difficult to enforce correct nlink usage so just ignore this
+ * condition here
+ */
+ if (S_ISDIR(inode->i_mode))
+ clear_nlink(inode);
+ else if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&fi->lock);
+ fuse_invalidate_entry_cache(entry);
+ fuse_update_ctime(inode);
+}
+
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
@@ -762,24 +890,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
args.in_args[0].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
- struct inode *inode = d_inode(entry);
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
- /*
- * If i_nlink == 0 then unlink doesn't make sense, yet this can
- * happen if userspace filesystem is careless. It would be
- * difficult to enforce correct nlink usage so just ignore this
- * condition here
- */
- if (inode->i_nlink > 0)
- drop_nlink(inode);
- spin_unlock(&fi->lock);
- fuse_invalidate_attr(inode);
fuse_dir_changed(dir);
- fuse_invalidate_entry_cache(entry);
- fuse_update_ctime(inode);
+ fuse_entry_unlinked(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -801,9 +913,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
args.in_args[0].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
- clear_nlink(d_inode(entry));
fuse_dir_changed(dir);
- fuse_invalidate_entry_cache(entry);
+ fuse_entry_unlinked(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -833,24 +944,18 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
err = fuse_simple_request(fm, &args);
if (!err) {
/* ctime changes */
- fuse_invalidate_attr(d_inode(oldent));
fuse_update_ctime(d_inode(oldent));
- if (flags & RENAME_EXCHANGE) {
- fuse_invalidate_attr(d_inode(newent));
+ if (flags & RENAME_EXCHANGE)
fuse_update_ctime(d_inode(newent));
- }
fuse_dir_changed(olddir);
if (olddir != newdir)
fuse_dir_changed(newdir);
/* newent will end up negative */
- if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
- fuse_invalidate_attr(d_inode(newent));
- fuse_invalidate_entry_cache(newent);
- fuse_update_ctime(d_inode(newent));
- }
+ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent))
+ fuse_entry_unlinked(newent);
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
rename actually took place. If the invalidation
@@ -916,25 +1021,11 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
- /* Contrary to "normal" filesystems it can happen that link
- makes two "logical" inodes point to the same "physical"
- inode. We invalidate the attributes of the old one, so it
- will reflect changes in the backing inode (link count,
- etc.)
- */
- if (!err) {
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
- if (likely(inode->i_nlink < UINT_MAX))
- inc_nlink(inode);
- spin_unlock(&fi->lock);
- fuse_invalidate_attr(inode);
- fuse_update_ctime(inode);
- } else if (err == -EINTR) {
+ if (!err)
+ fuse_update_ctime_in_cache(inode);
+ else if (err == -EINTR)
fuse_invalidate_attr(inode);
- }
+
return err;
}
@@ -944,15 +1035,6 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
unsigned int blkbits;
struct fuse_conn *fc = get_fuse_conn(inode);
- /* see the comment in fuse_change_attributes() */
- if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
- attr->size = i_size_read(inode);
- attr->mtime = inode->i_mtime.tv_sec;
- attr->mtimensec = inode->i_mtime.tv_nsec;
- attr->ctime = inode->i_ctime.tv_sec;
- attr->ctimensec = inode->i_ctime.tv_nsec;
- }
-
stat->dev = inode->i_sb->s_dev;
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -1030,12 +1112,14 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
bool sync;
+ u32 inval_mask = READ_ONCE(fi->inval_mask);
+ u32 cache_mask = fuse_get_cache_mask(inode);
if (flags & AT_STATX_FORCE_SYNC)
sync = true;
else if (flags & AT_STATX_DONT_SYNC)
sync = false;
- else if (request_mask & READ_ONCE(fi->inval_mask))
+ else if (request_mask & inval_mask & ~cache_mask)
sync = true;
else
sync = time_before64(fi->i_time, get_jiffies_64());
@@ -1052,11 +1136,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
return err;
}
-int fuse_update_attributes(struct inode *inode, struct file *file)
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
{
- /* Do *not* need to get atime for internal purposes */
- return fuse_update_get_attr(inode, file, NULL,
- STATX_BASIC_STATS & ~STATX_ATIME, 0);
+ return fuse_update_get_attr(inode, file, NULL, mask, 0);
}
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
@@ -1071,7 +1153,7 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
if (!parent)
return -ENOENT;
- inode_lock(parent);
+ inode_lock_nested(parent, I_MUTEX_PARENT);
if (!S_ISDIR(parent->i_mode))
goto unlock;
@@ -1561,10 +1643,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
bool is_truncate = false;
- bool is_wb = fc->writeback_cache;
+ bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode);
loff_t oldsize;
int err;
- bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+ bool trust_local_cmtime = is_wb;
bool fault_blocked = false;
if (!fc->default_permissions)
@@ -1608,7 +1690,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
/* Flush dirty data/metadata before non-truncate SETATTR */
- if (is_wb && S_ISREG(inode->i_mode) &&
+ if (is_wb &&
attr->ia_valid &
(ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
ATTR_TIMES_SET)) {
@@ -1676,10 +1758,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
fuse_change_attributes_common(inode, &outarg.attr,
- attr_timeout(&outarg));
+ attr_timeout(&outarg),
+ fuse_get_cache_mask(inode));
oldsize = inode->i_size;
/* see the comment in fuse_change_attributes() */
- if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+ if (!is_wb || is_truncate)
i_size_write(inode, outarg.attr.size);
if (is_truncate) {
@@ -1690,7 +1773,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
/*
* Only call invalidate_inode_pages2() after removing
- * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
+ * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock.
*/
if ((is_truncate || !is_wb) &&
S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 11404f8c21c7..f18d14d5fea1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -211,9 +211,8 @@ void fuse_finish_open(struct inode *inode, struct file *file)
i_size_write(inode, 0);
spin_unlock(&fi->lock);
truncate_pagecache(inode, 0);
- fuse_invalidate_attr(inode);
- if (fc->writeback_cache)
- file_update_time(file);
+ file_update_time(file);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
invalidate_inode_pages2(inode->i_mapping);
}
@@ -339,12 +338,6 @@ static int fuse_open(struct inode *inode, struct file *file)
static int fuse_release(struct inode *inode, struct file *file)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
-
- /* see fuse_vma_close() for !writeback_cache case */
- if (fc->writeback_cache)
- write_inode_now(inode, 1);
-
fuse_release_common(file, false);
/* return value is ignored by VFS */
@@ -483,6 +476,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (fuse_is_bad(inode))
return -EIO;
+ if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
+ return 0;
+
err = write_inode_now(inode, 1);
if (err)
return err;
@@ -521,7 +517,7 @@ inval_attr_out:
* enabled, i_blocks from cached attr may not be accurate.
*/
if (!err && fm->fc->writeback_cache)
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
return err;
}
@@ -687,7 +683,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
spin_unlock(&fi->lock);
}
- io->iocb->ki_complete(io->iocb, res, 0);
+ io->iocb->ki_complete(io->iocb, res);
}
kref_put(&io->refcnt, fuse_io_release);
@@ -793,7 +789,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- if (attr_ver == fi->attr_version && size < inode->i_size &&
+ if (attr_ver >= fi->attr_version && size < inode->i_size &&
!test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, size);
@@ -970,6 +966,14 @@ static void fuse_readahead(struct readahead_control *rac)
struct fuse_io_args *ia;
struct fuse_args_pages *ap;
+ if (fc->num_background >= fc->congestion_threshold &&
+ rac->ra->async_size >= readahead_count(rac))
+ /*
+ * Congested and only async pages left, so skip the
+ * rest.
+ */
+ break;
+
nr_pages = readahead_count(rac) - nr_pages;
if (nr_pages > max_pages)
nr_pages = max_pages;
@@ -1003,7 +1007,7 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (fc->auto_inval_data ||
(iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
int err;
- err = fuse_update_attributes(inode, iocb->ki_filp);
+ err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
if (err)
return err;
}
@@ -1072,7 +1076,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
return err ?: ia->write.out.size;
}
-bool fuse_write_update_size(struct inode *inode, loff_t pos)
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1080,12 +1084,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos)
spin_lock(&fi->lock);
fi->attr_version = atomic64_inc_return(&fc->attr_version);
- if (pos > inode->i_size) {
+ if (written > 0 && pos > inode->i_size) {
i_size_write(inode, pos);
ret = true;
}
spin_unlock(&fi->lock);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+
return ret;
}
@@ -1164,7 +1170,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
again:
err = -EFAULT;
- if (iov_iter_fault_in_readable(ii, bytes))
+ if (fault_in_iov_iter_readable(ii, bytes))
break;
err = -ENOMEM;
@@ -1268,11 +1274,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
kfree(ap->pages);
} while (!err && iov_iter_count(ii));
- if (res > 0)
- fuse_write_update_size(inode, pos);
-
+ fuse_write_update_attr(inode, pos, res);
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
- fuse_invalidate_attr(inode);
return res > 0 ? res : err;
}
@@ -1290,7 +1293,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
- err = fuse_update_attributes(mapping->host, file);
+ err = fuse_update_attributes(mapping->host, file,
+ STATX_SIZE | STATX_MODE);
if (err)
return err;
@@ -1417,6 +1421,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
+ ap->args.user_pages = true;
if (write)
ap->args.in_pages = true;
else
@@ -1451,7 +1456,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!ia)
return -ENOMEM;
- ia->io = io;
if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
if (!write)
inode_lock(inode);
@@ -1561,11 +1565,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
} else {
res = fuse_direct_io(&io, from, &iocb->ki_pos,
FUSE_DIO_WRITE);
+ fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
- fuse_invalidate_attr(inode);
- if (res > 0)
- fuse_write_update_size(inode, iocb->ki_pos);
inode_unlock(inode);
return res;
@@ -1776,7 +1778,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
* is enabled, we trust local ctime/mtime.
*/
if (!fc->writeback_cache)
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
spin_lock(&fi->lock);
rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
@@ -1822,14 +1824,13 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
{
- struct fuse_file *ff = NULL;
+ struct fuse_file *ff;
spin_lock(&fi->lock);
- if (!list_empty(&fi->write_files)) {
- ff = list_entry(fi->write_files.next, struct fuse_file,
- write_entry);
+ ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
+ write_entry);
+ if (ff)
fuse_file_get(ff);
- }
spin_unlock(&fi->lock);
return ff;
@@ -1848,6 +1849,17 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
struct fuse_file *ff;
int err;
+ /*
+ * Inode is always written before the last reference is dropped and
+ * hence this should not be reached from reclaim.
+ *
+ * Writing back the inode from reclaim can deadlock if the request
+ * processing itself needs an allocation. Allocations triggering
+ * reclaim while serving a request can't be prevented, because it can
+ * involve any number of unrelated userspace processes.
+ */
+ WARN_ON(wbc->for_reclaim);
+
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
@@ -1955,6 +1967,7 @@ err:
static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
int err;
if (fuse_page_is_writeback(page->mapping->host, page->index)) {
@@ -1970,6 +1983,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return AOP_WRITEPAGE_ACTIVATE;
+
err = fuse_writepage_locked(page);
unlock_page(page);
@@ -2223,6 +2240,10 @@ static int fuse_writepages(struct address_space *mapping,
if (fuse_is_bad(inode))
goto out;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return 0;
+
data.inode = inode;
data.wpa = NULL;
data.ff = NULL;
@@ -2306,15 +2327,18 @@ static int fuse_write_end(struct file *file, struct address_space *mapping,
if (!copied)
goto unlock;
+ pos += copied;
if (!PageUptodate(page)) {
/* Zero any unwritten bytes at the end of the page */
- size_t endoff = (pos + copied) & ~PAGE_MASK;
+ size_t endoff = pos & ~PAGE_MASK;
if (endoff)
zero_user_segment(page, endoff, PAGE_SIZE);
SetPageUptodate(page);
}
- fuse_write_update_size(inode, pos + copied);
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+
set_page_dirty(page);
unlock:
@@ -2324,28 +2348,31 @@ unlock:
return copied;
}
-static int fuse_launder_page(struct page *page)
+static int fuse_launder_folio(struct folio *folio)
{
int err = 0;
- if (clear_page_dirty_for_io(page)) {
- struct inode *inode = page->mapping->host;
+ if (folio_clear_dirty_for_io(folio)) {
+ struct inode *inode = folio->mapping->host;
/* Serialize with pending writeback for the same page */
- fuse_wait_on_page_writeback(inode, page->index);
- err = fuse_writepage_locked(page);
+ fuse_wait_on_page_writeback(inode, folio->index);
+ err = fuse_writepage_locked(&folio->page);
if (!err)
- fuse_wait_on_page_writeback(inode, page->index);
+ fuse_wait_on_page_writeback(inode, folio->index);
}
return err;
}
/*
- * Write back dirty pages now, because there may not be any suitable
- * open files later
+ * Write back dirty data/metadata now (there may not be any suitable
+ * open files later for data)
*/
static void fuse_vma_close(struct vm_area_struct *vma)
{
- filemap_write_and_wait(vma->vm_file->f_mapping);
+ int err;
+
+ err = write_inode_now(vma->vm_file->f_mapping->host, 1);
+ mapping_set_error(vma->vm_file->f_mapping, err);
}
/*
@@ -2628,7 +2655,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
fallback:
- err = fuse_update_attributes(inode, file);
+ err = fuse_update_attributes(inode, file, STATX_SIZE);
if (!err)
return generic_file_llseek(file, offset, whence);
else
@@ -2648,7 +2675,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
break;
case SEEK_END:
inode_lock(inode);
- retval = fuse_update_attributes(inode, file);
+ retval = fuse_update_attributes(inode, file, STATX_SIZE);
if (!retval)
retval = generic_file_llseek(file, offset, whence);
inode_unlock(inode);
@@ -2869,7 +2896,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (iov_iter_rw(iter) == WRITE) {
ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else {
ret = __fuse_direct_read(io, iter, &pos);
}
@@ -2891,9 +2918,8 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
kref_put(&io->refcnt, fuse_io_release);
if (iov_iter_rw(iter) == WRITE) {
- if (ret > 0)
- fuse_write_update_size(inode, pos);
- else if (ret < 0 && offset + count > i_size)
+ fuse_write_update_attr(inode, pos, ret);
+ if (ret < 0 && offset + count > i_size)
fuse_do_truncate(file);
}
@@ -2902,7 +2928,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
{
- int err = filemap_write_and_wait_range(inode->i_mapping, start, -1);
+ int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
if (!err)
fuse_sync_writes(inode);
@@ -2981,16 +3007,14 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
/* we could have extended the file */
if (!(mode & FALLOC_FL_KEEP_SIZE)) {
- bool changed = fuse_write_update_size(inode, offset + length);
-
- if (changed && fm->fc->writeback_cache)
+ if (fuse_write_update_attr(inode, offset + length, length))
file_update_time(file);
}
if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
truncate_pagecache_range(inode, offset, offset + length - 1);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
out:
if (!(mode & FALLOC_FL_KEEP_SIZE))
@@ -3002,6 +3026,8 @@ out:
if (lock_inode)
inode_unlock(inode);
+ fuse_flush_time_update(inode);
+
return err;
}
@@ -3096,12 +3122,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
ALIGN_DOWN(pos_out, PAGE_SIZE),
ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
- if (fc->writeback_cache) {
- fuse_write_update_size(inode_out, pos_out + outarg.size);
- file_update_time(file_out);
- }
-
- fuse_invalidate_attr(inode_out);
+ file_update_time(file_out);
+ fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
err = outarg.size;
out:
@@ -3111,6 +3133,8 @@ out:
inode_unlock(inode_out);
file_accessed(file_in);
+ fuse_flush_time_update(inode_out);
+
return err;
}
@@ -3155,15 +3179,15 @@ static const struct address_space_operations fuse_file_aops = {
.readahead = fuse_readahead,
.writepage = fuse_writepage,
.writepages = fuse_writepages,
- .launder_page = fuse_launder_page,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .launder_folio = fuse_launder_folio,
+ .dirty_folio = filemap_dirty_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
.write_begin = fuse_write_begin,
.write_end = fuse_write_end,
};
-void fuse_init_file_inode(struct inode *inode)
+void fuse_init_file_inode(struct inode *inode, unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -3177,5 +3201,5 @@ void fuse_init_file_inode(struct inode *inode)
fi->writepages = RB_ROOT;
if (IS_ENABLED(CONFIG_FUSE_DAX))
- fuse_dax_inode_init(inode);
+ fuse_dax_inode_init(inode, flags);
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f55f9f94b1a4..488b460e046f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -256,6 +256,7 @@ struct fuse_args {
bool nocreds:1;
bool in_pages:1;
bool out_pages:1;
+ bool user_pages:1;
bool out_argvar:1;
bool page_zeroing:1;
bool page_replace:1;
@@ -480,6 +481,18 @@ struct fuse_dev {
struct list_head entry;
};
+enum fuse_dax_mode {
+ FUSE_DAX_INODE_DEFAULT, /* default */
+ FUSE_DAX_ALWAYS, /* "-o dax=always" */
+ FUSE_DAX_NEVER, /* "-o dax=never" */
+ FUSE_DAX_INODE_USER, /* "-o dax=inode" */
+};
+
+static inline bool fuse_is_inode_dax_mode(enum fuse_dax_mode mode)
+{
+ return mode == FUSE_DAX_INODE_DEFAULT || mode == FUSE_DAX_INODE_USER;
+}
+
struct fuse_fs_context {
int fd;
struct file *file;
@@ -497,7 +510,7 @@ struct fuse_fs_context {
bool no_control:1;
bool no_force_umount:1;
bool legacy_opts_show:1;
- bool dax:1;
+ enum fuse_dax_mode dax_mode;
unsigned int max_read;
unsigned int blksize;
const char *subtype;
@@ -614,7 +627,7 @@ struct fuse_conn {
/** Connection successful. Only set in INIT */
unsigned conn_init:1;
- /** Do readpages asynchronously? Only set in INIT */
+ /** Do readahead asynchronously? Only set in INIT */
unsigned async_read:1;
/** Return an unique read error after abort. Only set in INIT */
@@ -765,6 +778,12 @@ struct fuse_conn {
/* Propagate syncfs() to server */
unsigned int sync_fs:1;
+ /* Initialize security xattrs when creating a new inode */
+ unsigned int init_security:1;
+
+ /* Does the filesystem support per inode DAX? */
+ unsigned int inode_dax:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -802,6 +821,9 @@ struct fuse_conn {
struct list_head devices;
#ifdef CONFIG_FUSE_DAX
+ /* Dax mode */
+ enum fuse_dax_mode dax_mode;
+
/* Dax specific conn data, non-NULL if DAX is enabled */
struct fuse_conn_dax *dax;
#endif
@@ -1007,7 +1029,7 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
/**
* Initialize file operations on a regular file
*/
-void fuse_init_file_inode(struct inode *inode);
+void fuse_init_file_inode(struct inode *inode, unsigned int flags);
/**
* Initialize inode operations on regular files and special files
@@ -1031,7 +1053,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid, u64 attr_version);
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid);
+ u64 attr_valid, u32 cache_mask);
+
+u32 fuse_get_cache_mask(struct inode *inode);
/**
* Initialize the client device
@@ -1065,7 +1089,15 @@ void fuse_wait_aborted(struct fuse_conn *fc);
/**
* Invalidate inode attributes
*/
+
+/* Attributes possibly changed on data modification */
+#define FUSE_STATX_MODIFY (STATX_MTIME | STATX_CTIME | STATX_BLOCKS)
+
+/* Attributes possibly changed on data and/or size modification */
+#define FUSE_STATX_MODSIZE (FUSE_STATX_MODIFY | STATX_SIZE)
+
void fuse_invalidate_attr(struct inode *inode);
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask);
void fuse_invalidate_entry_cache(struct dentry *entry);
@@ -1148,9 +1180,10 @@ int fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+void fuse_flush_time_update(struct inode *inode);
void fuse_update_ctime(struct inode *inode);
-int fuse_update_attributes(struct inode *inode, struct file *file);
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask);
void fuse_flush_writepages(struct inode *inode);
@@ -1208,7 +1241,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
__poll_t fuse_file_poll(struct file *file, poll_table *wait);
int fuse_dev_release(struct inode *inode, struct file *file);
-bool fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written);
int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
@@ -1258,11 +1291,13 @@ ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end);
-int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
+int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode,
+ struct dax_device *dax_dev);
void fuse_dax_conn_free(struct fuse_conn *fc);
bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
-void fuse_dax_inode_init(struct inode *inode);
+void fuse_dax_inode_init(struct inode *inode, unsigned int flags);
void fuse_dax_inode_cleanup(struct inode *inode);
+void fuse_dax_dontcache(struct inode *inode, unsigned int flags);
bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
void fuse_dax_cancel_work(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 12d49a1914e8..8c0665c5dff8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -23,6 +23,7 @@
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/pid_namespace.h>
+#include <uapi/linux/magic.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh,
"Global limit for the maximum congestion threshold an "
"unprivileged user can set");
-#define FUSE_SUPER_MAGIC 0x65735546
-
#define FUSE_DEFAULT_BLKSIZE 512
/** Maximum number of outstanding background requests */
@@ -73,7 +72,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
{
struct fuse_inode *fi;
- fi = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
+ fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL);
if (!fi)
return NULL;
@@ -118,6 +117,9 @@ static void fuse_evict_inode(struct inode *inode)
{
struct fuse_inode *fi = get_fuse_inode(inode);
+ /* Will write inode on close/munmap and in all other dirtiers */
+ WARN_ON(inode->i_state & I_DIRTY_INODE);
+
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (inode->i_sb->s_flags & SB_ACTIVE) {
@@ -161,7 +163,7 @@ static ino_t fuse_squash_ino(u64 ino64)
}
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid)
+ u64 attr_valid, u32 cache_mask)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -181,9 +183,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
/* mtime from server may be stale due to local buffered write */
- if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+ if (!(cache_mask & STATX_MTIME)) {
inode->i_mtime.tv_sec = attr->mtime;
inode->i_mtime.tv_nsec = attr->mtimensec;
+ }
+ if (!(cache_mask & STATX_CTIME)) {
inode->i_ctime.tv_sec = attr->ctime;
inode->i_ctime.tv_nsec = attr->ctimensec;
}
@@ -215,16 +219,44 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_flags &= ~S_NOSEC;
}
+u32 fuse_get_cache_mask(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
+ return 0;
+
+ return STATX_MTIME | STATX_CTIME | STATX_SIZE;
+}
+
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid, u64 attr_version)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- bool is_wb = fc->writeback_cache;
+ u32 cache_mask;
loff_t oldsize;
struct timespec64 old_mtime;
spin_lock(&fi->lock);
+ /*
+ * In case of writeback_cache enabled, writes update mtime, ctime and
+ * may update i_size. In these cases trust the cached value in the
+ * inode.
+ */
+ cache_mask = fuse_get_cache_mask(inode);
+ if (cache_mask & STATX_SIZE)
+ attr->size = i_size_read(inode);
+
+ if (cache_mask & STATX_MTIME) {
+ attr->mtime = inode->i_mtime.tv_sec;
+ attr->mtimensec = inode->i_mtime.tv_nsec;
+ }
+ if (cache_mask & STATX_CTIME) {
+ attr->ctime = inode->i_ctime.tv_sec;
+ attr->ctimensec = inode->i_ctime.tv_nsec;
+ }
+
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fi->lock);
@@ -232,7 +264,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
}
old_mtime = inode->i_mtime;
- fuse_change_attributes_common(inode, attr, attr_valid);
+ fuse_change_attributes_common(inode, attr, attr_valid, cache_mask);
oldsize = inode->i_size;
/*
@@ -240,11 +272,11 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
* extend local i_size without keeping userspace server in sync. So,
* attr->size coming from server can be stale. We cannot trust it.
*/
- if (!is_wb || !S_ISREG(inode->i_mode))
+ if (!(cache_mask & STATX_SIZE))
i_size_write(inode, attr->size);
spin_unlock(&fi->lock);
- if (!is_wb && S_ISREG(inode->i_mode)) {
+ if (!cache_mask && S_ISREG(inode->i_mode)) {
bool inval = false;
if (oldsize != attr->size) {
@@ -268,6 +300,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
if (inval)
invalidate_inode_pages2(inode->i_mapping);
}
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_dontcache(inode, attr->flags);
}
static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
@@ -280,7 +315,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
inode->i_ctime.tv_nsec = attr->ctimensec;
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
- fuse_init_file_inode(inode);
+ fuse_init_file_inode(inode, attr->flags);
} else if (S_ISDIR(inode->i_mode))
fuse_init_dir(inode);
else if (S_ISLNK(inode->i_mode))
@@ -734,8 +769,12 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",blksize=%lu", sb->s_blocksize);
}
#ifdef CONFIG_FUSE_DAX
- if (fc->dax)
- seq_puts(m, ",dax");
+ if (fc->dax_mode == FUSE_DAX_ALWAYS)
+ seq_puts(m, ",dax=always");
+ else if (fc->dax_mode == FUSE_DAX_NEVER)
+ seq_puts(m, ",dax=never");
+ else if (fc->dax_mode == FUSE_DAX_INODE_USER)
+ seq_puts(m, ",dax=inode");
#endif
return 0;
@@ -1076,73 +1115,80 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
process_init_limits(fc, arg);
if (arg->minor >= 6) {
+ u64 flags = arg->flags | (u64) arg->flags2 << 32;
+
ra_pages = arg->max_readahead / PAGE_SIZE;
- if (arg->flags & FUSE_ASYNC_READ)
+ if (flags & FUSE_ASYNC_READ)
fc->async_read = 1;
- if (!(arg->flags & FUSE_POSIX_LOCKS))
+ if (!(flags & FUSE_POSIX_LOCKS))
fc->no_lock = 1;
if (arg->minor >= 17) {
- if (!(arg->flags & FUSE_FLOCK_LOCKS))
+ if (!(flags & FUSE_FLOCK_LOCKS))
fc->no_flock = 1;
} else {
- if (!(arg->flags & FUSE_POSIX_LOCKS))
+ if (!(flags & FUSE_POSIX_LOCKS))
fc->no_flock = 1;
}
- if (arg->flags & FUSE_ATOMIC_O_TRUNC)
+ if (flags & FUSE_ATOMIC_O_TRUNC)
fc->atomic_o_trunc = 1;
if (arg->minor >= 9) {
/* LOOKUP has dependency on proto version */
- if (arg->flags & FUSE_EXPORT_SUPPORT)
+ if (flags & FUSE_EXPORT_SUPPORT)
fc->export_support = 1;
}
- if (arg->flags & FUSE_BIG_WRITES)
+ if (flags & FUSE_BIG_WRITES)
fc->big_writes = 1;
- if (arg->flags & FUSE_DONT_MASK)
+ if (flags & FUSE_DONT_MASK)
fc->dont_mask = 1;
- if (arg->flags & FUSE_AUTO_INVAL_DATA)
+ if (flags & FUSE_AUTO_INVAL_DATA)
fc->auto_inval_data = 1;
- else if (arg->flags & FUSE_EXPLICIT_INVAL_DATA)
+ else if (flags & FUSE_EXPLICIT_INVAL_DATA)
fc->explicit_inval_data = 1;
- if (arg->flags & FUSE_DO_READDIRPLUS) {
+ if (flags & FUSE_DO_READDIRPLUS) {
fc->do_readdirplus = 1;
- if (arg->flags & FUSE_READDIRPLUS_AUTO)
+ if (flags & FUSE_READDIRPLUS_AUTO)
fc->readdirplus_auto = 1;
}
- if (arg->flags & FUSE_ASYNC_DIO)
+ if (flags & FUSE_ASYNC_DIO)
fc->async_dio = 1;
- if (arg->flags & FUSE_WRITEBACK_CACHE)
+ if (flags & FUSE_WRITEBACK_CACHE)
fc->writeback_cache = 1;
- if (arg->flags & FUSE_PARALLEL_DIROPS)
+ if (flags & FUSE_PARALLEL_DIROPS)
fc->parallel_dirops = 1;
- if (arg->flags & FUSE_HANDLE_KILLPRIV)
+ if (flags & FUSE_HANDLE_KILLPRIV)
fc->handle_killpriv = 1;
if (arg->time_gran && arg->time_gran <= 1000000000)
fm->sb->s_time_gran = arg->time_gran;
- if ((arg->flags & FUSE_POSIX_ACL)) {
+ if ((flags & FUSE_POSIX_ACL)) {
fc->default_permissions = 1;
fc->posix_acl = 1;
fm->sb->s_xattr = fuse_acl_xattr_handlers;
}
- if (arg->flags & FUSE_CACHE_SYMLINKS)
+ if (flags & FUSE_CACHE_SYMLINKS)
fc->cache_symlinks = 1;
- if (arg->flags & FUSE_ABORT_ERROR)
+ if (flags & FUSE_ABORT_ERROR)
fc->abort_err = 1;
- if (arg->flags & FUSE_MAX_PAGES) {
+ if (flags & FUSE_MAX_PAGES) {
fc->max_pages =
min_t(unsigned int, fc->max_pages_limit,
max_t(unsigned int, arg->max_pages, 1));
}
- if (IS_ENABLED(CONFIG_FUSE_DAX) &&
- arg->flags & FUSE_MAP_ALIGNMENT &&
- !fuse_dax_check_alignment(fc, arg->map_alignment)) {
- ok = false;
+ if (IS_ENABLED(CONFIG_FUSE_DAX)) {
+ if (flags & FUSE_MAP_ALIGNMENT &&
+ !fuse_dax_check_alignment(fc, arg->map_alignment)) {
+ ok = false;
+ }
+ if (flags & FUSE_HAS_INODE_DAX)
+ fc->inode_dax = 1;
}
- if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) {
+ if (flags & FUSE_HANDLE_KILLPRIV_V2) {
fc->handle_killpriv_v2 = 1;
fm->sb->s_flags |= SB_NOSEC;
}
- if (arg->flags & FUSE_SETXATTR_EXT)
+ if (flags & FUSE_SETXATTR_EXT)
fc->setxattr_ext = 1;
+ if (flags & FUSE_SECURITY_CTX)
+ fc->init_security = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1170,13 +1216,14 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
void fuse_send_init(struct fuse_mount *fm)
{
struct fuse_init_args *ia;
+ u64 flags;
ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL);
ia->in.major = FUSE_KERNEL_VERSION;
ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE;
- ia->in.flags |=
+ flags =
FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -1186,13 +1233,19 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
- FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT;
+ FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
+ FUSE_SECURITY_CTX;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
- ia->in.flags |= FUSE_MAP_ALIGNMENT;
+ flags |= FUSE_MAP_ALIGNMENT;
+ if (fuse_is_inode_dax_mode(fm->fc->dax_mode))
+ flags |= FUSE_HAS_INODE_DAX;
#endif
if (fm->fc->auto_submounts)
- ia->in.flags |= FUSE_SUBMOUNTS;
+ flags |= FUSE_SUBMOUNTS;
+
+ ia->in.flags = flags;
+ ia->in.flags2 = flags >> 32;
ia->args.opcode = FUSE_INIT;
ia->args.in_numargs = 1;
@@ -1481,7 +1534,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
sb->s_subtype = ctx->subtype;
ctx->subtype = NULL;
if (IS_ENABLED(CONFIG_FUSE_DAX)) {
- err = fuse_dax_conn_alloc(fc, ctx->dax_dev);
+ err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev);
if (err)
goto err;
}
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 546ea3d58fb4..33cde4bbccdc 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -170,7 +170,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
#else
if (flags & FUSE_IOCTL_COMPAT) {
inarg.flags |= FUSE_IOCTL_32BIT;
-#ifdef CONFIG_X86_X32
+#ifdef CONFIG_X86_X32_ABI
if (in_x32_syscall())
inarg.flags |= FUSE_IOCTL_COMPAT_X32;
#endif
@@ -286,11 +286,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
goto out;
- vaddr = kmap_atomic(ap.pages[0]);
+ vaddr = kmap_local_page(ap.pages[0]);
err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
transferred, in_iovs + out_iovs,
(flags & FUSE_IOCTL_COMPAT) != 0);
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
if (err)
goto out;
@@ -394,9 +394,12 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
args.out_args[1].value = ptr;
err = fuse_simple_request(fm, &args);
- if (!err && outarg.flags & FUSE_IOCTL_RETRY)
- err = -EIO;
-
+ if (!err) {
+ if (outarg.result < 0)
+ err = outarg.result;
+ else if (outarg.flags & FUSE_IOCTL_RETRY)
+ err = -EIO;
+ }
return err;
}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index bc267832310c..b4e565711045 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -76,11 +76,11 @@ static void fuse_add_dirent_to_cache(struct file *file,
WARN_ON(fi->rdc.pos != pos))
goto unlock;
- addr = kmap_atomic(page);
+ addr = kmap_local_page(page);
if (!offset)
clear_page(addr);
memcpy(addr + offset, dirent, reclen);
- kunmap_atomic(addr);
+ kunmap_local(addr);
fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
fi->rdc.pos = dirent->off;
unlock:
@@ -454,7 +454,7 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
* cache; both cases require an up-to-date mtime value.
*/
if (!ctx->pos && fc->auto_inval_data) {
- int err = fuse_update_attributes(inode, file);
+ int err = fuse_update_attributes(inode, file, STATX_MTIME);
if (err)
return err;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 94fc874f5de7..86b7dbb6a0d4 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -8,6 +8,7 @@
#include <linux/dax.h>
#include <linux/pci.h>
#include <linux/pfn_t.h>
+#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
@@ -88,12 +89,21 @@ struct virtio_fs_req_work {
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
struct fuse_req *req, bool in_flight);
+static const struct constant_table dax_param_enums[] = {
+ {"always", FUSE_DAX_ALWAYS },
+ {"never", FUSE_DAX_NEVER },
+ {"inode", FUSE_DAX_INODE_USER },
+ {}
+};
+
enum {
OPT_DAX,
+ OPT_DAX_ENUM,
};
static const struct fs_parameter_spec virtio_fs_parameters[] = {
fsparam_flag("dax", OPT_DAX),
+ fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums),
{}
};
@@ -110,7 +120,10 @@ static int virtio_fs_parse_param(struct fs_context *fsc,
switch (opt) {
case OPT_DAX:
- ctx->dax = 1;
+ ctx->dax_mode = FUSE_DAX_ALWAYS;
+ break;
+ case OPT_DAX_ENUM:
+ ctx->dax_mode = result.uint_32;
break;
default:
return -EINVAL;
@@ -649,7 +662,7 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
int vq_type)
{
- strncpy(fsvq->name, name, VQ_NAME_LEN);
+ strscpy(fsvq->name, name, VQ_NAME_LEN);
spin_lock_init(&fsvq->lock);
INIT_LIST_HEAD(&fsvq->queued_reqs);
INIT_LIST_HEAD(&fsvq->end_reqs);
@@ -753,20 +766,6 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
-static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
- pgoff_t pgoff, void *addr,
- size_t bytes, struct iov_iter *i)
-{
- return copy_from_iter(addr, bytes, i);
-}
-
-static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
- pgoff_t pgoff, void *addr,
- size_t bytes, struct iov_iter *i)
-{
- return copy_to_iter(addr, bytes, i);
-}
-
static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
pgoff_t pgoff, size_t nr_pages)
{
@@ -783,8 +782,6 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
static const struct dax_operations virtio_fs_dax_ops = {
.direct_access = virtio_fs_direct_access,
- .copy_from_iter = virtio_fs_copy_from_iter,
- .copy_to_iter = virtio_fs_copy_to_iter,
.zero_page_range = virtio_fs_zero_page_range,
};
@@ -850,7 +847,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
__func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
- fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0);
+ fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
if (IS_ERR(fs->dax_dev))
return PTR_ERR(fs->dax_dev);
@@ -895,7 +892,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
return 0;
out_vqs:
- vdev->config->reset(vdev);
+ virtio_reset_device(vdev);
virtio_fs_cleanup_vqs(vdev, fs);
kfree(fs->vqs);
@@ -927,7 +924,7 @@ static void virtio_fs_remove(struct virtio_device *vdev)
list_del_init(&fs->list);
virtio_fs_stop_all_queues(fs);
virtio_fs_drain_all_queues_locked(fs);
- vdev->config->reset(vdev);
+ virtio_reset_device(vdev);
virtio_fs_cleanup_vqs(vdev, fs);
vdev->priv = NULL;
@@ -1326,8 +1323,8 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
/* virtiofs allocates and installs its own fuse devices */
ctx->fudptr = NULL;
- if (ctx->dax) {
- if (!fs->dax_dev) {
+ if (ctx->dax_mode != FUSE_DAX_NEVER) {
+ if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) {
err = -EINVAL;
pr_err("virtio-fs: dax can't be enabled as filesystem"
" device does not support it.\n");
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 61dfaf7b7d20..0d3e7177fce0 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -42,10 +42,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
fm->fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
- if (!err) {
- fuse_invalidate_attr(inode);
+ if (!err)
fuse_update_ctime(inode);
- }
+
return err;
}
@@ -173,10 +172,9 @@ int fuse_removexattr(struct inode *inode, const char *name)
fm->fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
- if (!err) {
- fuse_invalidate_attr(inode);
+ if (!err)
fuse_update_ctime(inode);
- }
+
return err;
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 005e920f5d4a..72c9f31ce724 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -606,18 +606,12 @@ out:
gfs2_trans_end(sdp);
}
-/**
- * jdata_set_page_dirty - Page dirtying function
- * @page: The page to dirty
- *
- * Returns: 1 if it dirtyed the page, or 0 otherwise
- */
-
-static int jdata_set_page_dirty(struct page *page)
+static bool jdata_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
if (current->journal_info)
- SetPageChecked(page);
- return __set_page_dirty_buffers(page);
+ folio_set_checked(folio);
+ return block_dirty_folio(mapping, folio);
}
/**
@@ -672,22 +666,23 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
unlock_buffer(bh);
}
-static void gfs2_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void gfs2_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
- unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_SIZE);
+ struct gfs2_sbd *sdp = GFS2_SB(folio->mapping->host);
+ size_t stop = offset + length;
+ int partial_page = (offset || length < folio_size(folio));
struct buffer_head *bh, *head;
unsigned long pos = 0;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
if (!partial_page)
- ClearPageChecked(page);
- if (!page_has_buffers(page))
+ folio_clear_checked(folio);
+ head = folio_buffers(folio);
+ if (!head)
goto out;
- bh = head = page_buffers(page);
+ bh = head;
do {
if (pos + bh->b_size > stop)
return;
@@ -699,7 +694,7 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset,
} while (bh != head);
out:
if (!partial_page)
- try_to_release_page(page, 0);
+ filemap_release_folio(folio, 0);
}
/**
@@ -779,9 +774,9 @@ static const struct address_space_operations gfs2_aops = {
.writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readahead = gfs2_readahead,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .invalidate_folio = iomap_invalidate_folio,
.bmap = gfs2_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
@@ -794,9 +789,9 @@ static const struct address_space_operations gfs2_jdata_aops = {
.writepages = gfs2_jdata_writepages,
.readpage = gfs2_readpage,
.readahead = gfs2_readahead,
- .set_page_dirty = jdata_set_page_dirty,
+ .dirty_folio = jdata_dirty_folio,
.bmap = gfs2_bmap,
- .invalidatepage = gfs2_invalidatepage,
+ .invalidate_folio = gfs2_invalidate_folio,
.releasepage = gfs2_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5414c2c33580..b6697333bb2b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -606,9 +606,9 @@ out:
return ret;
}
-static inline __be64 *gfs2_indirect_init(struct metapath *mp,
- struct gfs2_glock *gl, unsigned int i,
- unsigned offset, u64 bn)
+static inline void gfs2_indirect_init(struct metapath *mp,
+ struct gfs2_glock *gl, unsigned int i,
+ unsigned offset, u64 bn)
{
__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
((i > 1) ? sizeof(struct gfs2_meta_header) :
@@ -621,7 +621,6 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp,
gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
ptr += offset;
*ptr = cpu_to_be64(bn);
- return ptr;
}
enum alloc_state {
@@ -940,7 +939,7 @@ do_alloc:
else if (height == ip->i_height)
ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
else
- iomap->length = size - pos;
+ iomap->length = size - iomap->offset;
} else if (flags & IOMAP_WRITE) {
u64 alloc_size;
@@ -961,46 +960,6 @@ hole_found:
goto out;
}
-static int gfs2_write_lock(struct inode *inode)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- int error;
-
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
- error = gfs2_glock_nq(&ip->i_gh);
- if (error)
- goto out_uninit;
- if (&ip->i_inode == sdp->sd_rindex) {
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
-
- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
- GL_NOCACHE, &m_ip->i_gh);
- if (error)
- goto out_unlock;
- }
- return 0;
-
-out_unlock:
- gfs2_glock_dq(&ip->i_gh);
-out_uninit:
- gfs2_holder_uninit(&ip->i_gh);
- return error;
-}
-
-static void gfs2_write_unlock(struct inode *inode)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
-
- if (&ip->i_inode == sdp->sd_rindex) {
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
-
- gfs2_glock_dq_uninit(&m_ip->i_gh);
- }
- gfs2_glock_dq_uninit(&ip->i_gh);
-}
-
static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
unsigned len)
{
@@ -1118,11 +1077,6 @@ out_qunlock:
return ret;
}
-static inline bool gfs2_iomap_need_write_lock(unsigned flags)
-{
- return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
-}
-
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, struct iomap *iomap,
struct iomap *srcmap)
@@ -1135,12 +1089,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
iomap->flags |= IOMAP_F_BUFFER_HEAD;
trace_gfs2_iomap_start(ip, pos, length, flags);
- if (gfs2_iomap_need_write_lock(flags)) {
- ret = gfs2_write_lock(inode);
- if (ret)
- goto out;
- }
-
ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
if (ret)
goto out_unlock;
@@ -1168,10 +1116,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
out_unlock:
- if (ret && gfs2_iomap_need_write_lock(flags))
- gfs2_write_unlock(inode);
release_metapath(&mp);
-out:
trace_gfs2_iomap_end(ip, iomap, ret);
return ret;
}
@@ -1208,26 +1153,21 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (length != written && (iomap->flags & IOMAP_F_NEW)) {
/* Deallocate blocks that were just allocated. */
- loff_t blockmask = i_blocksize(inode) - 1;
- loff_t end = (pos + length) & ~blockmask;
+ loff_t hstart = round_up(pos + written, i_blocksize(inode));
+ loff_t hend = iomap->offset + iomap->length;
- pos = (pos + written + blockmask) & ~blockmask;
- if (pos < end) {
- truncate_pagecache_range(inode, pos, end - 1);
- punch_hole(ip, pos, end - pos);
+ if (hstart < hend) {
+ truncate_pagecache_range(inode, hstart, hend - 1);
+ punch_hole(ip, hstart, hend - hstart);
}
}
if (unlikely(!written))
- goto out_unlock;
+ return 0;
if (iomap->flags & IOMAP_F_SIZE_CHANGED)
mark_inode_dirty(inode);
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
-
-out_unlock:
- if (gfs2_iomap_need_write_lock(flags))
- gfs2_write_unlock(inode);
return 0;
}
@@ -2204,7 +2144,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
ret = do_shrink(inode, newsize);
out:
- gfs2_rs_delete(ip, NULL);
+ gfs2_rs_delete(ip);
gfs2_qa_put(ip);
return ret;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c559827cb6f9..2556ae1f92ea 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -213,11 +213,9 @@ void gfs2_set_inode_flags(struct inode *inode)
* @inode: The inode
* @reqflags: The flags to set
* @mask: Indicates which flags are valid
- * @fsflags: The FS_* inode flags passed in
*
*/
-static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask,
- const u32 fsflags)
+static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -236,11 +234,6 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask,
if ((new_flags ^ flags) == 0)
goto out;
- error = -EPERM;
- if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
- goto out;
- if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
- goto out;
if (!IS_IMMUTABLE(inode)) {
error = gfs2_permission(&init_user_ns, inode, MAY_WRITE);
if (error)
@@ -313,7 +306,7 @@ int gfs2_fileattr_set(struct user_namespace *mnt_userns,
mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA);
}
- return do_gfs2_set_flags(inode, gfsflags, mask, fsflags);
+ return do_gfs2_set_flags(inode, gfsflags, mask);
}
static int gfs2_getlabel(struct file *filp, char __user *label)
@@ -711,10 +704,11 @@ static int gfs2_release(struct inode *inode, struct file *file)
kfree(file->private_data);
file->private_data = NULL;
- if (gfs2_rs_active(&ip->i_res))
- gfs2_rs_delete(ip, &inode->i_writecount);
- if (file->f_mode & FMODE_WRITE)
+ if (file->f_mode & FMODE_WRITE) {
+ if (gfs2_rs_active(&ip->i_res))
+ gfs2_rs_delete(ip);
gfs2_qa_put(ip);
+ }
return 0;
}
@@ -776,27 +770,93 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
return ret ? ret : ret1;
}
+static inline bool should_fault_in_pages(struct iov_iter *i,
+ struct kiocb *iocb,
+ size_t *prev_count,
+ size_t *window_size)
+{
+ size_t count = iov_iter_count(i);
+ size_t size, offs;
+
+ if (!count)
+ return false;
+ if (!iter_is_iovec(i))
+ return false;
+
+ size = PAGE_SIZE;
+ offs = offset_in_page(iocb->ki_pos);
+ if (*prev_count != count || !*window_size) {
+ size_t nr_dirtied;
+
+ nr_dirtied = max(current->nr_dirtied_pause -
+ current->nr_dirtied, 8);
+ size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT);
+ }
+
+ *prev_count = count;
+ *window_size = size - offs;
+ return true;
+}
+
static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
struct gfs2_holder *gh)
{
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
- size_t count = iov_iter_count(to);
+ size_t prev_count = 0, window_size = 0;
+ size_t read = 0;
ssize_t ret;
- if (!count)
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ *
+ * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
+ * physical as well as manual page faults, and we need to disable both
+ * kinds.
+ *
+ * For direct I/O, gfs2 takes the inode glock in deferred mode. This
+ * locking mode is compatible with other deferred holders, so multiple
+ * processes and nodes can do direct I/O to a file at the same time.
+ * There's no guarantee that reads or writes will be atomic. Any
+ * coordination among readers and writers needs to happen externally.
+ */
+
+ if (!iov_iter_count(to))
return 0; /* skip atime */
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
+ pagefault_disable();
+ to->nofault = true;
+ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
+ IOMAP_DIO_PARTIAL, read);
+ to->nofault = false;
+ pagefault_enable();
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
+ if (ret > 0)
+ read = ret;
- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
- gfs2_glock_dq(gh);
+ if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ window_size -= fault_in_iov_iter_writeable(to, window_size);
+ if (window_size)
+ goto retry;
+ }
+out_unlock:
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- return ret;
+ if (ret < 0)
+ return ret;
+ return read;
}
static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -805,11 +865,21 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- size_t len = iov_iter_count(from);
- loff_t offset = iocb->ki_pos;
+ size_t prev_count = 0, window_size = 0;
+ size_t written = 0;
ssize_t ret;
/*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ *
+ * For writes, iomap_dio_rw only triggers manual page faults, so we
+ * don't need to disable physical ones.
+ */
+
+ /*
* Deferred lock, even if its a write, since we do no allocation on
* this path. All we need to change is the atime, and this lock mode
* ensures that other nodes have flushed their buffered read caches
@@ -818,45 +888,71 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
* VFS does.
*/
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-
/* Silently fall back to buffered I/O when writing beyond EOF */
- if (offset + len > i_size_read(&ip->i_inode))
- goto out;
+ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
+ goto out_unlock;
- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
- if (ret == -ENOTBLK)
- ret = 0;
-out:
- gfs2_glock_dq(gh);
+ from->nofault = true;
+ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
+ IOMAP_DIO_PARTIAL, written);
+ from->nofault = false;
+ if (ret <= 0) {
+ if (ret == -ENOTBLK)
+ ret = 0;
+ if (ret != -EFAULT)
+ goto out_unlock;
+ }
+ if (ret > 0)
+ written = ret;
+
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ window_size -= fault_in_iov_iter_readable(from, window_size);
+ if (window_size)
+ goto retry;
+ }
+out_unlock:
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- return ret;
+ if (ret < 0)
+ return ret;
+ return written;
}
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct gfs2_inode *ip;
struct gfs2_holder gh;
- size_t written = 0;
+ size_t prev_count = 0, window_size = 0;
+ size_t read = 0;
ssize_t ret;
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = gfs2_file_direct_read(iocb, to, &gh);
- if (likely(ret != -ENOTBLK))
- return ret;
- iocb->ki_flags &= ~IOCB_DIRECT;
- }
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ */
+
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return gfs2_file_direct_read(iocb, to, &gh);
+
+ pagefault_disable();
iocb->ki_flags |= IOCB_NOIO;
ret = generic_file_read_iter(iocb, to);
iocb->ki_flags &= ~IOCB_NOIO;
+ pagefault_enable();
if (ret >= 0) {
if (!iov_iter_count(to))
return ret;
- written = ret;
- } else {
+ read = ret;
+ } else if (ret != -EFAULT) {
if (ret != -EAGAIN)
return ret;
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -864,15 +960,111 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
ip = GFS2_I(iocb->ki_filp->f_mapping->host);
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+retry:
ret = gfs2_glock_nq(&gh);
if (ret)
goto out_uninit;
+ pagefault_disable();
ret = generic_file_read_iter(iocb, to);
+ pagefault_enable();
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
if (ret > 0)
- written += ret;
- gfs2_glock_dq(&gh);
+ read += ret;
+
+ if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(&gh);
+ window_size -= fault_in_iov_iter_writeable(to, window_size);
+ if (window_size)
+ goto retry;
+ }
+out_unlock:
+ if (gfs2_holder_queued(&gh))
+ gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
+ return read ? read : ret;
+}
+
+static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from,
+ struct gfs2_holder *gh)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_holder *statfs_gh = NULL;
+ size_t prev_count = 0, window_size = 0;
+ size_t orig_count = iov_iter_count(from);
+ size_t written = 0;
+ ssize_t ret;
+
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ */
+
+ if (inode == sdp->sd_rindex) {
+ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
+ if (!statfs_gh)
+ return -ENOMEM;
+ }
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
+retry:
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ window_size -= fault_in_iov_iter_readable(from, window_size);
+ if (!window_size) {
+ ret = -EFAULT;
+ goto out_uninit;
+ }
+ from->count = min(from->count, window_size);
+ }
+ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
+
+ if (inode == sdp->sd_rindex) {
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+ GL_NOCACHE, statfs_gh);
+ if (ret)
+ goto out_unlock;
+ }
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ pagefault_disable();
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ pagefault_enable();
+ current->backing_dev_info = NULL;
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ written += ret;
+ }
+
+ if (inode == sdp->sd_rindex)
+ gfs2_glock_dq_uninit(statfs_gh);
+
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
+
+ from->count = orig_count - written;
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ goto retry;
+ }
+out_unlock:
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
+out_uninit:
+ gfs2_holder_uninit(gh);
+ if (statfs_gh)
+ kfree(statfs_gh);
+ from->count = orig_count - written;
return written ? written : ret;
}
@@ -927,9 +1119,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out_unlock;
iocb->ki_flags |= IOCB_DSYNC;
- current->backing_dev_info = inode_to_bdi(inode);
- buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- current->backing_dev_info = NULL;
+ buffered = gfs2_file_buffered_write(iocb, from, &gh);
if (unlikely(buffered <= 0)) {
if (!ret)
ret = buffered;
@@ -943,7 +1133,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* the direct I/O range as we don't know if the buffered pages
* made it to disk.
*/
- iocb->ki_pos += buffered;
ret2 = generic_write_sync(iocb, buffered);
invalidate_mapping_pages(mapping,
(iocb->ki_pos - buffered) >> PAGE_SHIFT,
@@ -951,13 +1140,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!ret || ret2 > 0)
ret += ret2;
} else {
- current->backing_dev_info = inode_to_bdi(inode);
- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- current->backing_dev_info = NULL;
- if (likely(ret > 0)) {
- iocb->ki_pos += ret;
+ ret = gfs2_file_buffered_write(iocb, from, &gh);
+ if (likely(ret > 0))
ret = generic_write_sync(iocb, ret);
- }
}
out_unlock:
@@ -1294,7 +1479,6 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (error != GLR_TRYFAILED)
break;
fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
- fl_gh->gh_error = 0;
msleep(sleeptime);
}
if (error) {
@@ -1338,8 +1522,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
if (fl->fl_type == F_UNLCK) {
do_unflock(file, fl);
@@ -1353,7 +1535,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
@@ -1386,7 +1568,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e0eaa9cf9fb6..630c6550eacf 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -58,6 +58,7 @@ struct gfs2_glock_iter {
typedef void (*glock_examiner) (struct gfs2_glock * gl);
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
+static void __gfs2_glock_dq(struct gfs2_holder *gh);
static struct dentry *gfs2_root;
static struct workqueue_struct *glock_workqueue;
@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl)
if (gl->gl_state == LM_ST_UNLOCKED)
return 0;
+ /*
+ * Note that demote_ok is used for the lru process of disposing of
+ * glocks. For this purpose, we don't care if the glock's holders
+ * have the HIF_MAY_DEMOTE flag set or not. If someone is using
+ * them, don't demote.
+ */
if (!list_empty(&gl->gl_holders))
return 0;
if (glops->go_demote_ok)
@@ -301,46 +308,59 @@ void gfs2_glock_put(struct gfs2_glock *gl)
}
/**
- * may_grant - check if its ok to grant a new lock
+ * may_grant - check if it's ok to grant a new lock
* @gl: The glock
+ * @current_gh: One of the current holders of @gl
* @gh: The lock request which we wish to grant
*
- * Returns: true if its ok to grant the lock
+ * With our current compatibility rules, if a glock has one or more active
+ * holders (HIF_HOLDER flag set), any of those holders can be passed in as
+ * @current_gh; they are all the same as far as compatibility with the new @gh
+ * goes.
+ *
+ * Returns true if it's ok to grant the lock.
*/
-static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
-{
- const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
+static inline bool may_grant(struct gfs2_glock *gl,
+ struct gfs2_holder *current_gh,
+ struct gfs2_holder *gh)
+{
+ if (current_gh) {
+ GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags));
+
+ switch(current_gh->gh_state) {
+ case LM_ST_EXCLUSIVE:
+ /*
+ * Here we make a special exception to grant holders
+ * who agree to share the EX lock with other holders
+ * who also have the bit set. If the original holder
+ * has the LM_FLAG_NODE_SCOPE bit set, we grant more
+ * holders with the bit set.
+ */
+ return gh->gh_state == LM_ST_EXCLUSIVE &&
+ (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
+ (gh->gh_flags & LM_FLAG_NODE_SCOPE);
- if (gh != gh_head) {
- /**
- * Here we make a special exception to grant holders who agree
- * to share the EX lock with other holders who also have the
- * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit
- * is set, we grant more holders with the bit set.
- */
- if (gh_head->gh_state == LM_ST_EXCLUSIVE &&
- (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) &&
- gh->gh_state == LM_ST_EXCLUSIVE &&
- (gh->gh_flags & LM_FLAG_NODE_SCOPE))
- return 1;
- if ((gh->gh_state == LM_ST_EXCLUSIVE ||
- gh_head->gh_state == LM_ST_EXCLUSIVE))
- return 0;
+ case LM_ST_SHARED:
+ case LM_ST_DEFERRED:
+ return gh->gh_state == current_gh->gh_state;
+
+ default:
+ return false;
+ }
}
+
if (gl->gl_state == gh->gh_state)
- return 1;
+ return true;
if (gh->gh_flags & GL_EXACT)
- return 0;
+ return false;
if (gl->gl_state == LM_ST_EXCLUSIVE) {
- if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
- return 1;
- if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
- return 1;
+ return gh->gh_state == LM_ST_SHARED ||
+ gh->gh_state == LM_ST_DEFERRED;
}
- if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
- return 1;
- return 0;
+ if (gh->gh_flags & LM_FLAG_ANY)
+ return gl->gl_state != LM_ST_UNLOCKED;
+ return false;
}
static void gfs2_holder_wake(struct gfs2_holder *gh)
@@ -366,7 +386,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
struct gfs2_holder *gh, *tmp;
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
continue;
if (ret & LM_OUT_ERROR)
gh->gh_error = -EIO;
@@ -381,6 +401,119 @@ static void do_error(struct gfs2_glock *gl, const int ret)
}
/**
+ * demote_incompat_holders - demote incompatible demoteable holders
+ * @gl: the glock we want to promote
+ * @new_gh: the new holder to be promoted
+ */
+static void demote_incompat_holders(struct gfs2_glock *gl,
+ struct gfs2_holder *new_gh)
+{
+ struct gfs2_holder *gh, *tmp;
+
+ /*
+ * Demote incompatible holders before we make ourselves eligible.
+ * (This holder may or may not allow auto-demoting, but we don't want
+ * to demote the new holder before it's even granted.)
+ */
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ /*
+ * Since holders are at the front of the list, we stop when we
+ * find the first non-holder.
+ */
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return;
+ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
+ !may_grant(gl, new_gh, gh)) {
+ /*
+ * We should not recurse into do_promote because
+ * __gfs2_glock_dq only calls handle_callback,
+ * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
+ */
+ __gfs2_glock_dq(gh);
+ }
+ }
+}
+
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ if (!list_empty(&gl->gl_holders)) {
+ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
+ gh_list);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * find_first_strong_holder - find the first non-demoteable holder
+ * @gl: the glock
+ *
+ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
+ */
+static inline struct gfs2_holder *
+find_first_strong_holder(struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return NULL;
+ if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/*
+ * gfs2_instantiate - Call the glops instantiate function
+ * @gh: The glock holder
+ *
+ * Returns: 0 if instantiate was successful, 2 if type specific operation is
+ * underway, or error.
+ */
+int gfs2_instantiate(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ int ret;
+
+again:
+ if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags))
+ return 0;
+
+ /*
+ * Since we unlock the lockref lock, we set a flag to indicate
+ * instantiate is in progress.
+ */
+ if (test_and_set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) {
+ wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG,
+ TASK_UNINTERRUPTIBLE);
+ /*
+ * Here we just waited for a different instantiate to finish.
+ * But that may not have been successful, as when a process
+ * locks an inode glock _before_ it has an actual inode to
+ * instantiate into. So we check again. This process might
+ * have an inode to instantiate, so might be successful.
+ */
+ goto again;
+ }
+
+ ret = glops->go_instantiate(gh);
+ if (!ret)
+ clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
+ clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags);
+ return ret;
+}
+
+/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
@@ -392,44 +525,59 @@ static int do_promote(struct gfs2_glock *gl)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh, *tmp;
+ struct gfs2_holder *gh, *tmp, *first_gh;
+ bool incompat_holders_demoted = false;
+ bool lock_released;
int ret;
restart:
+ first_gh = find_first_strong_holder(gl);
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ lock_released = false;
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
- if (may_grant(gl, gh)) {
- if (gh->gh_list.prev == &gl->gl_holders &&
- glops->go_lock) {
- spin_unlock(&gl->gl_lockref.lock);
- /* FIXME: eliminate this eventually */
- ret = glops->go_lock(gh);
- spin_lock(&gl->gl_lockref.lock);
- if (ret) {
- if (ret == 1)
- return 2;
- gh->gh_error = ret;
- list_del_init(&gh->gh_list);
- trace_gfs2_glock_queue(gh, 0);
- gfs2_holder_wake(gh);
- goto restart;
- }
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- trace_gfs2_promote(gh, 1);
+ if (!may_grant(gl, first_gh, gh)) {
+ /*
+ * If we get here, it means we may not grant this holder for
+ * some reason. If this holder is the head of the list, it
+ * means we have a blocked holder at the head, so return 1.
+ */
+ if (list_is_first(&gh->gh_list, &gl->gl_holders))
+ return 1;
+ do_error(gl, 0);
+ break;
+ }
+ if (!incompat_holders_demoted) {
+ demote_incompat_holders(gl, first_gh);
+ incompat_holders_demoted = true;
+ first_gh = gh;
+ }
+ if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags) &&
+ !(gh->gh_flags & GL_SKIP) && gl->gl_ops->go_instantiate) {
+ lock_released = true;
+ spin_unlock(&gl->gl_lockref.lock);
+ ret = gfs2_instantiate(gh);
+ spin_lock(&gl->gl_lockref.lock);
+ if (ret) {
+ if (ret == 1)
+ return 2;
+ gh->gh_error = ret;
+ list_del_init(&gh->gh_list);
+ trace_gfs2_glock_queue(gh, 0);
gfs2_holder_wake(gh);
goto restart;
}
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- trace_gfs2_promote(gh, 0);
- gfs2_holder_wake(gh);
- continue;
}
- if (gh->gh_list.prev == &gl->gl_holders)
- return 1;
- do_error(gl, 0);
- break;
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ trace_gfs2_promote(gh);
+ gfs2_holder_wake(gh);
+ /*
+ * If we released the gl_lockref.lock the holders list may have
+ * changed. For that reason, we start again at the start of
+ * the holders queue.
+ */
+ if (lock_released)
+ goto restart;
}
return 0;
}
@@ -521,6 +669,8 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
/* Check for state != intended state */
if (unlikely(state != gl->gl_target)) {
+ if (gh && (ret & LM_OUT_CANCELED))
+ gfs2_holder_wake(gh);
if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
/* move to back of queue and try next entry */
if (ret & LM_OUT_CANCELED) {
@@ -723,23 +873,6 @@ out:
}
/**
- * find_first_holder - find the first "holder" gh
- * @gl: the glock
- */
-
-static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
-
- if (!list_empty(&gl->gl_holders)) {
- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
- return gh;
- }
- return NULL;
-}
-
-/**
* run_queue - do all outstanding tasks related to a glock
* @gl: The glock in question
* @nonblock: True if we must not block in run_queue
@@ -822,7 +955,7 @@ static void gfs2_glock_poke(struct gfs2_glock *gl)
struct gfs2_holder gh;
int error;
- gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh);
+ __gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh, _RET_IP_);
error = gfs2_glock_nq(&gh);
if (!error)
gfs2_glock_dq(&gh);
@@ -1057,7 +1190,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
atomic_inc(&sdp->sd_glock_disposal);
gl->gl_node.next = NULL;
- gl->gl_flags = 0;
+ gl->gl_flags = glops->go_instantiate ? BIT(GLF_INSTANTIATE_NEEDED) : 0;
gl->gl_name = name;
lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
gl->gl_lockref.count = 1;
@@ -1111,7 +1244,7 @@ out:
}
/**
- * gfs2_holder_init - initialize a struct gfs2_holder in the default way
+ * __gfs2_holder_init - initialize a struct gfs2_holder in the default way
* @gl: the glock
* @state: the state we're requesting
* @flags: the modifier flags
@@ -1119,16 +1252,15 @@ out:
*
*/
-void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
- struct gfs2_holder *gh)
+void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
+ struct gfs2_holder *gh, unsigned long ip)
{
INIT_LIST_HEAD(&gh->gh_list);
gh->gh_gl = gl;
- gh->gh_ip = _RET_IP_;
+ gh->gh_ip = ip;
gh->gh_owner_pid = get_pid(task_pid(current));
gh->gh_state = state;
gh->gh_flags = flags;
- gh->gh_error = 0;
gh->gh_iflags = 0;
gfs2_glock_hold(gl);
}
@@ -1354,15 +1486,20 @@ __acquires(&gl->gl_lockref.lock)
GLOCK_BUG_ON(gl, true);
if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
- if (test_bit(GLF_LOCK, &gl->gl_flags))
- try_futile = !may_grant(gl, gh);
+ if (test_bit(GLF_LOCK, &gl->gl_flags)) {
+ struct gfs2_holder *first_gh;
+
+ first_gh = find_first_strong_holder(gl);
+ try_futile = !may_grant(gl, first_gh, gh);
+ }
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
goto fail;
}
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
+ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
+ !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
goto trap_recursive;
if (try_futile &&
!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
@@ -1429,6 +1566,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
if (test_bit(GLF_LRU, &gl->gl_flags))
gfs2_glock_remove_from_lru(gl);
+ gh->gh_error = 0;
spin_lock(&gl->gl_lockref.lock);
add_to_queue(gh);
if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
@@ -1458,51 +1596,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
}
-/**
- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
- * @gh: the glock holder
- *
- */
+static inline bool needs_demote(struct gfs2_glock *gl)
+{
+ return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
+ test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
+}
-void gfs2_glock_dq(struct gfs2_holder *gh)
+static void __gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned delay = 0;
int fast_path = 0;
- spin_lock(&gl->gl_lockref.lock);
/*
- * If we're in the process of file system withdraw, we cannot just
- * dequeue any glocks until our journal is recovered, lest we
- * introduce file system corruption. We need two exceptions to this
- * rule: We need to allow unlocking of nondisk glocks and the glock
- * for our own journal that needs recovery.
+ * This while loop is similar to function demote_incompat_holders:
+ * If the glock is due to be demoted (which may be from another node
+ * or even if this holder is GL_NOCACHE), the weak holders are
+ * demoted as well, allowing the glock to be demoted.
*/
- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
- glock_blocked_by_withdraw(gl) &&
- gh->gh_gl != sdp->sd_jinode_gl) {
- sdp->sd_glock_dqs_held++;
- spin_unlock(&gl->gl_lockref.lock);
- might_sleep();
- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&gl->gl_lockref.lock);
- }
- if (gh->gh_flags & GL_NOCACHE)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ while (gh) {
+ /*
+ * If we're in the process of file system withdraw, we cannot
+ * just dequeue any glocks until our journal is recovered, lest
+ * we introduce file system corruption. We need two exceptions
+ * to this rule: We need to allow unlocking of nondisk glocks
+ * and the glock for our own journal that needs recovery.
+ */
+ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+ glock_blocked_by_withdraw(gl) &&
+ gh->gh_gl != sdp->sd_jinode_gl) {
+ sdp->sd_glock_dqs_held++;
+ spin_unlock(&gl->gl_lockref.lock);
+ might_sleep();
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
+ }
- list_del_init(&gh->gh_list);
- clear_bit(HIF_HOLDER, &gh->gh_iflags);
- if (list_empty(&gl->gl_holders) &&
- !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
- !test_bit(GLF_DEMOTE, &gl->gl_flags))
- fast_path = 1;
+ /*
+ * This holder should not be cached, so mark it for demote.
+ * Note: this should be done before the check for needs_demote
+ * below.
+ */
+ if (gh->gh_flags & GL_NOCACHE)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+
+ list_del_init(&gh->gh_list);
+ clear_bit(HIF_HOLDER, &gh->gh_iflags);
+ trace_gfs2_glock_queue(gh, 0);
+
+ /*
+ * If there hasn't been a demote request we are done.
+ * (Let the remaining holders, if any, keep holding it.)
+ */
+ if (!needs_demote(gl)) {
+ if (list_empty(&gl->gl_holders))
+ fast_path = 1;
+ break;
+ }
+ /*
+ * If we have another strong holder (we cannot auto-demote)
+ * we are done. It keeps holding it until it is done.
+ */
+ if (find_first_strong_holder(gl))
+ break;
+
+ /*
+ * If we have a weak holder at the head of the list, it
+ * (and all others like it) must be auto-demoted. If there
+ * are no more weak holders, we exit the while loop.
+ */
+ gh = find_first_holder(gl);
+ }
if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
gfs2_glock_add_to_lru(gl);
- trace_gfs2_glock_queue(gh, 0);
if (unlikely(!fast_path)) {
gl->gl_lockref.count++;
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1511,6 +1681,27 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
delay = gl->gl_hold_time;
__gfs2_glock_queue_work(gl, delay);
}
+}
+
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ */
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_lockref.lock);
+ if (list_is_first(&gh->gh_list, &gl->gl_holders) &&
+ !test_bit(HIF_HOLDER, &gh->gh_iflags)) {
+ spin_unlock(&gl->gl_lockref.lock);
+ gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl);
+ wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
+ }
+
+ __gfs2_glock_dq(gh);
spin_unlock(&gl->gl_lockref.lock);
}
@@ -1687,6 +1878,33 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
delay = gl->gl_hold_time;
}
+ /*
+ * Note 1: We cannot call demote_incompat_holders from handle_callback
+ * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
+ * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
+ * Plus, we only want to demote the holders if the request comes from
+ * a remote cluster node because local holder conflicts are resolved
+ * elsewhere.
+ *
+ * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
+ * request that we set our state to UNLOCKED. Here we mock up a holder
+ * to make it look like someone wants the lock EX locally. Any SH
+ * and DF requests should be able to share the lock without demoting.
+ *
+ * Note 3: We only want to demote the demoteable holders when there
+ * are no more strong holders. The demoteable holders might as well
+ * keep the glock until the last strong holder is done with it.
+ */
+ if (!find_first_strong_holder(gl)) {
+ struct gfs2_holder mock_gh = {
+ .gh_gl = gl,
+ .gh_state = (state == LM_ST_UNLOCKED) ?
+ LM_ST_EXCLUSIVE : state,
+ .gh_iflags = BIT(HIF_HOLDER)
+ };
+
+ demote_incompat_holders(gl, &mock_gh);
+ }
handle_callback(gl, state, delay, true);
__gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
@@ -1893,10 +2111,10 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
do {
rhashtable_walk_start(&iter);
- while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl))
- if (gl->gl_name.ln_sbd == sdp &&
- lockref_get_not_dead(&gl->gl_lockref))
+ while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) {
+ if (gl->gl_name.ln_sbd == sdp)
examiner(gl);
+ }
rhashtable_walk_stop(&iter);
} while (cond_resched(), gl == ERR_PTR(-EAGAIN));
@@ -1919,7 +2137,7 @@ bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay)
void gfs2_cancel_delete_work(struct gfs2_glock *gl)
{
- if (cancel_delayed_work_sync(&gl->gl_delete)) {
+ if (cancel_delayed_work(&gl->gl_delete)) {
clear_bit(GLF_PENDING_DELETE, &gl->gl_flags);
gfs2_glock_put(gl);
}
@@ -1938,7 +2156,6 @@ static void flush_delete_work(struct gfs2_glock *gl)
&gl->gl_delete, 0);
}
}
- gfs2_glock_queue_work(gl, 0);
}
void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
@@ -1955,10 +2172,10 @@ void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
static void thaw_glock(struct gfs2_glock *gl)
{
- if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) {
- gfs2_glock_put(gl);
+ if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+ return;
+ if (!lockref_get_not_dead(&gl->gl_lockref))
return;
- }
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
gfs2_glock_queue_work(gl, 0);
}
@@ -1974,9 +2191,12 @@ static void clear_glock(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_lock(&gl->gl_lockref.lock);
- if (gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- __gfs2_glock_queue_work(gl, 0);
+ if (!__lockref_is_dead(&gl->gl_lockref)) {
+ gl->gl_lockref.count++;
+ if (gl->gl_state != LM_ST_UNLOCKED)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ __gfs2_glock_queue_work(gl, 0);
+ }
spin_unlock(&gl->gl_lockref.lock);
}
@@ -2076,6 +2296,10 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
*p++ = 'H';
if (test_bit(HIF_WAIT, &iflags))
*p++ = 'W';
+ if (test_bit(HIF_MAY_DEMOTE, &iflags))
+ *p++ = 'D';
+ if (flags & GL_SKIP)
+ *p++ = 's';
*p = 0;
return buf;
}
@@ -2144,6 +2368,10 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'P';
if (test_bit(GLF_FREEING, gflags))
*p++ = 'x';
+ if (test_bit(GLF_INSTANTIATE_NEEDED, gflags))
+ *p++ = 'n';
+ if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags))
+ *p++ = 'N';
*p = 0;
return buf;
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 31a8f2f649b5..4f8642301801 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
break;
+ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
+ continue;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -188,13 +190,21 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
extern void gfs2_glock_hold(struct gfs2_glock *gl);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
-extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
- u16 flags, struct gfs2_holder *gh);
+
+extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+ u16 flags, struct gfs2_holder *gh,
+ unsigned long ip);
+static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+ u16 flags, struct gfs2_holder *gh) {
+ __gfs2_holder_init(gl, state, flags, gh, _RET_IP_);
+}
+
extern void gfs2_holder_reinit(unsigned int state, u16 flags,
struct gfs2_holder *gh);
extern void gfs2_holder_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq(struct gfs2_holder *gh);
extern int gfs2_glock_poll(struct gfs2_holder *gh);
+extern int gfs2_instantiate(struct gfs2_holder *gh);
extern int gfs2_glock_wait(struct gfs2_holder *gh);
extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq(struct gfs2_holder *gh);
@@ -239,7 +249,7 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
{
int error;
- gfs2_holder_init(gl, state, flags, gh);
+ __gfs2_holder_init(gl, state, flags, gh, _RET_IP_);
error = gfs2_glock_nq(gh);
if (error)
@@ -325,6 +335,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
spin_unlock(&gl->gl_lockref.lock);
}
+static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_lockref.lock);
+ set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
+static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_lockref.lock);
+ clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 79c621c7863d..392800f082a6 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -228,7 +228,6 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
gfs2_rgrp_brelse(rgd);
WARN_ON_ONCE(!(flags & DIO_METADATA));
truncate_inode_pages_range(mapping, start, end);
- rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
}
static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl,
@@ -356,7 +355,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
struct address_space *mapping = gfs2_glock2aspace(gl);
truncate_inode_pages(mapping, 0);
if (ip) {
- set_bit(GIF_INVALID, &ip->i_flags);
+ set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
forget_all_cached_acls(&ip->i_inode);
security_inode_invalidate_secctx(&ip->i_inode);
gfs2_dir_hash_inval(ip);
@@ -476,33 +475,29 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
error = gfs2_dinode_in(ip, dibh->b_data);
brelse(dibh);
- clear_bit(GIF_INVALID, &ip->i_flags);
-
return error;
}
/**
- * inode_go_lock - operation done after an inode lock is locked by a process
+ * inode_go_instantiate - read in an inode if necessary
* @gh: The glock holder
*
* Returns: errno
*/
-static int inode_go_lock(struct gfs2_holder *gh)
+static int inode_go_instantiate(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
- if (!ip || (gh->gh_flags & GL_SKIP))
- return 0;
+ if (!ip) /* no inode to populate - read it in later */
+ goto out;
- if (test_bit(GIF_INVALID, &ip->i_flags)) {
- error = gfs2_inode_refresh(ip);
- if (error)
- return error;
- }
+ error = gfs2_inode_refresh(ip);
+ if (error)
+ goto out;
if (gh->gh_state != LM_ST_DEFERRED)
inode_dio_wait(&ip->i_inode);
@@ -515,9 +510,10 @@ static int inode_go_lock(struct gfs2_holder *gh)
list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
spin_unlock(&sdp->sd_trunc_lock);
wake_up(&sdp->sd_quota_wait);
- return 1;
+ error = 1;
}
+out:
return error;
}
@@ -740,7 +736,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_sync = inode_go_sync,
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
- .go_lock = inode_go_lock,
+ .go_instantiate = inode_go_instantiate,
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
.go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB,
@@ -750,7 +746,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_sync = rgrp_go_sync,
.go_inval = rgrp_go_inval,
- .go_lock = gfs2_rgrp_go_lock,
+ .go_instantiate = gfs2_rgrp_go_instantiate,
.go_dump = gfs2_rgrp_go_dump,
.go_type = LM_TYPE_RGRP,
.go_flags = GLOF_LVB,
@@ -767,6 +763,7 @@ const struct gfs2_glock_operations gfs2_freeze_glops = {
const struct gfs2_glock_operations gfs2_iopen_glops = {
.go_type = LM_TYPE_IOPEN,
.go_callback = iopen_go_callback,
+ .go_dump = inode_go_dump,
.go_demote_ok = iopen_go_demote_ok,
.go_flags = GLOF_LRU | GLOF_NONDISK,
.go_subclass = 1,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 0fe49770166e..8c00fb389ae5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -119,7 +119,6 @@ struct gfs2_rgrpd {
u32 rd_flags;
u32 rd_extfail_pt; /* extent failure point */
#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
-#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
#define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */
#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
@@ -220,7 +219,7 @@ struct gfs2_glock_operations {
int (*go_xmote_bh)(struct gfs2_glock *gl);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (const struct gfs2_glock *gl);
- int (*go_lock) (struct gfs2_holder *gh);
+ int (*go_instantiate) (struct gfs2_holder *gh);
void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl,
const char *fs_id_buf);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
@@ -252,6 +251,7 @@ struct gfs2_lkstats {
enum {
/* States */
+ HIF_MAY_DEMOTE = 1,
HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_WAIT = 10,
};
@@ -315,6 +315,7 @@ struct gfs2_alloc_parms {
enum {
GLF_LOCK = 1,
+ GLF_INSTANTIATE_NEEDED = 2, /* needs instantiate */
GLF_DEMOTE = 3,
GLF_PENDING_DEMOTE = 4,
GLF_DEMOTE_IN_PROGRESS = 5,
@@ -324,6 +325,7 @@ enum {
GLF_REPLY_PENDING = 9,
GLF_INITIAL = 10,
GLF_FROZEN = 11,
+ GLF_INSTANTIATE_IN_PROG = 12, /* instantiate happening now */
GLF_LRU = 13,
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
@@ -370,7 +372,6 @@ struct gfs2_glock {
};
enum {
- GIF_INVALID = 0,
GIF_QD_LOCKED = 1,
GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
@@ -386,9 +387,8 @@ struct gfs2_inode {
u64 i_generation;
u64 i_eattr;
unsigned long i_flags; /* GIF_... */
- struct gfs2_glock *i_gl; /* Move into i_gh? */
+ struct gfs2_glock *i_gl;
struct gfs2_holder i_iopen_gh;
- struct gfs2_holder i_gh; /* for prepare/commit_write only */
struct gfs2_qadata *i_qadata; /* quota allocation data */
struct gfs2_holder i_rgd_gh;
struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 3130f85d2b3f..c8ec876f33ea 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -40,37 +40,6 @@ static const struct inode_operations gfs2_file_iops;
static const struct inode_operations gfs2_dir_iops;
static const struct inode_operations gfs2_symlink_iops;
-static int iget_test(struct inode *inode, void *opaque)
-{
- u64 no_addr = *(u64 *)opaque;
-
- return GFS2_I(inode)->i_no_addr == no_addr;
-}
-
-static int iget_set(struct inode *inode, void *opaque)
-{
- u64 no_addr = *(u64 *)opaque;
-
- GFS2_I(inode)->i_no_addr = no_addr;
- inode->i_ino = no_addr;
- return 0;
-}
-
-static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
-{
- struct inode *inode;
-
-repeat:
- inode = iget5_locked(sb, no_addr, iget_test, iget_set, &no_addr);
- if (!inode)
- return inode;
- if (is_bad_inode(inode)) {
- iput(inode);
- goto repeat;
- }
- return inode;
-}
-
/**
* gfs2_set_iop - Sets inode operations
* @inode: The inode with correct i_mode filled in
@@ -104,6 +73,22 @@ static void gfs2_set_iop(struct inode *inode)
}
}
+static int iget_test(struct inode *inode, void *opaque)
+{
+ u64 no_addr = *(u64 *)opaque;
+
+ return GFS2_I(inode)->i_no_addr == no_addr;
+}
+
+static int iget_set(struct inode *inode, void *opaque)
+{
+ u64 no_addr = *(u64 *)opaque;
+
+ GFS2_I(inode)->i_no_addr = no_addr;
+ inode->i_ino = no_addr;
+ return 0;
+}
+
/**
* gfs2_inode_lookup - Lookup an inode
* @sb: The super block
@@ -132,12 +117,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
{
struct inode *inode;
struct gfs2_inode *ip;
- struct gfs2_glock *io_gl = NULL;
struct gfs2_holder i_gh;
int error;
gfs2_holder_mark_uninitialized(&i_gh);
- inode = gfs2_iget(sb, no_addr);
+ inode = iget5_locked(sb, no_addr, iget_test, iget_set, &no_addr);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -145,22 +129,30 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (inode->i_state & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_glock *io_gl;
- error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+ error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE,
+ &ip->i_gl);
if (unlikely(error))
goto fail;
- flush_delayed_work(&ip->i_gl->gl_work);
- error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+ error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE,
+ &io_gl);
if (unlikely(error))
goto fail;
+
if (blktype != GFS2_BLKST_UNLINKED)
gfs2_cancel_delete_work(io_gl);
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT,
+ &ip->i_iopen_gh);
+ gfs2_glock_put(io_gl);
+ if (unlikely(error))
+ goto fail;
if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) {
/*
* The GL_SKIP flag indicates to skip reading the inode
- * block. We read the inode with gfs2_inode_refresh
+ * block. We read the inode when instantiating it
* after possibly checking the block type.
*/
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE,
@@ -181,24 +173,21 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
}
}
- glock_set_object(ip->i_gl, ip);
- set_bit(GIF_INVALID, &ip->i_flags);
- error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
- if (unlikely(error))
- goto fail;
- glock_set_object(ip->i_iopen_gh.gh_gl, ip);
- gfs2_glock_put(io_gl);
- io_gl = NULL;
+ set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags);
/* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */
inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1);
inode->i_atime.tv_nsec = 0;
+ glock_set_object(ip->i_gl, ip);
+
if (type == DT_UNKNOWN) {
/* Inode glock must be locked already */
- error = gfs2_inode_refresh(GFS2_I(inode));
- if (error)
+ error = gfs2_instantiate(&i_gh);
+ if (error) {
+ glock_clear_object(ip->i_gl, ip);
goto fail;
+ }
} else {
ip->i_no_formal_ino = no_formal_ino;
inode->i_mode = DT2IF(type);
@@ -206,27 +195,23 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (gfs2_holder_initialized(&i_gh))
gfs2_glock_dq_uninit(&i_gh);
+ glock_set_object(ip->i_iopen_gh.gh_gl, ip);
gfs2_set_iop(inode);
+ unlock_new_inode(inode);
}
if (no_formal_ino && ip->i_no_formal_ino &&
no_formal_ino != ip->i_no_formal_ino) {
- error = -ESTALE;
- if (inode->i_state & I_NEW)
- goto fail;
iput(inode);
- return ERR_PTR(error);
+ return ERR_PTR(-ESTALE);
}
- if (inode->i_state & I_NEW)
- unlock_new_inode(inode);
-
return inode;
fail:
- if (io_gl)
- gfs2_glock_put(io_gl);
+ if (gfs2_holder_initialized(&ip->i_iopen_gh))
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
if (gfs2_holder_initialized(&i_gh))
gfs2_glock_dq_uninit(&i_gh);
iget_failed(inode);
@@ -726,22 +711,26 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
if (error)
goto fail_free_inode;
- flush_delayed_work(&ip->i_gl->gl_work);
- glock_set_object(ip->i_gl, ip);
error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
if (error)
goto fail_free_inode;
gfs2_cancel_delete_work(io_gl);
- glock_set_object(io_gl, ip);
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+ error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr);
+ BUG_ON(error);
+
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (error)
goto fail_gunlock2;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+ if (error)
+ goto fail_gunlock3;
+
error = gfs2_trans_begin(sdp, blocks, 0);
if (error)
- goto fail_gunlock2;
+ goto fail_gunlock3;
if (blocks > 1) {
ip->i_eattr = ip->i_no_addr + 1;
@@ -750,12 +739,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
init_dinode(dip, ip, symname);
gfs2_trans_end(sdp);
- error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
- if (error)
- goto fail_gunlock2;
-
+ glock_set_object(ip->i_gl, ip);
+ glock_set_object(io_gl, ip);
gfs2_set_iop(inode);
- insert_inode_hash(inode);
free_vfs_inode = 0; /* After this point, the inode is no longer
considered free. Any failures need to undo
@@ -763,14 +749,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (default_acl) {
error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
- goto fail_gunlock3;
+ goto fail_gunlock4;
posix_acl_release(default_acl);
default_acl = NULL;
}
if (acl) {
error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
if (error)
- goto fail_gunlock3;
+ goto fail_gunlock4;
posix_acl_release(acl);
acl = NULL;
}
@@ -778,11 +764,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
&gfs2_initxattrs, NULL);
if (error)
- goto fail_gunlock3;
+ goto fail_gunlock4;
error = link_dinode(dip, name, ip, &da);
if (error)
- goto fail_gunlock3;
+ goto fail_gunlock4;
mark_inode_dirty(inode);
d_instantiate(dentry, inode);
@@ -797,21 +783,22 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
gfs2_glock_dq_uninit(ghs + 1);
gfs2_glock_put(io_gl);
gfs2_qa_put(dip);
+ unlock_new_inode(inode);
return error;
-fail_gunlock3:
+fail_gunlock4:
+ glock_clear_object(ip->i_gl, ip);
glock_clear_object(io_gl, ip);
+fail_gunlock3:
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_gunlock2:
- glock_clear_object(io_gl, ip);
gfs2_glock_put(io_gl);
fail_free_inode:
if (ip->i_gl) {
- glock_clear_object(ip->i_gl, ip);
if (free_vfs_inode) /* else evict will do the put for us */
gfs2_glock_put(ip->i_gl);
}
- gfs2_rs_delete(ip, NULL);
+ gfs2_rs_deltree(&ip->i_res);
gfs2_qa_put(ip);
fail_free_acls:
posix_acl_release(default_acl);
@@ -825,7 +812,10 @@ fail_gunlock:
mark_inode_dirty(inode);
set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
&GFS2_I(inode)->i_flags);
- iput(inode);
+ if (inode->i_state & I_NEW)
+ iget_failed(inode);
+ else
+ iput(inode);
}
if (gfs2_holder_initialized(ghs + 1))
gfs2_glock_dq_uninit(ghs + 1);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 50578f881e6d..2559a79cf14b 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -261,6 +261,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
int req;
u32 lkf;
char strname[GDLM_STRNAME_BYTES] = "";
+ int error;
req = make_mode(gl->gl_name.ln_sbd, req_state);
lkf = make_flags(gl, flags, req);
@@ -279,8 +280,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
* Submit the actual lock request.
*/
- return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
+again:
+ error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+ if (error == -EBUSY) {
+ msleep(20);
+ goto again;
+ }
+ return error;
}
static void gdlm_put_lock(struct gfs2_glock *gl)
@@ -312,8 +319,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
return;
}
+again:
error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
NULL, gl);
+ if (error == -EBUSY) {
+ msleep(20);
+ goto again;
+ }
+
if (error) {
fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n",
gl->gl_name.ln_type,
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ca0bb3a73912..6ba51cbb94cf 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -265,10 +265,9 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
bio_end_io_t *end_io)
{
struct super_block *sb = sdp->sd_vfs;
- struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ struct bio *bio = bio_alloc(sb->s_bdev, BIO_MAX_VECS, 0, GFP_NOIO);
bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift;
- bio_set_dev(bio, sb->s_bdev);
bio->bi_end_io = end_io;
bio->bi_private = sdp;
@@ -489,11 +488,9 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
{
struct bio *new;
- new = bio_alloc(GFP_NOIO, nr_iovecs);
- bio_copy_dev(new, prev);
+ new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO);
+ bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
- new->bi_write_hint = prev->bi_write_hint;
bio_chain(new, prev);
submit_bio(prev);
return new;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 72d30a682ece..d8bd1d48bd78 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -89,13 +89,15 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
}
const struct address_space_operations gfs2_meta_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
.releasepage = gfs2_releasepage,
};
const struct address_space_operations gfs2_rgrp_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
.releasepage = gfs2_releasepage,
};
@@ -222,9 +224,8 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
struct buffer_head *bh = *bhs;
struct bio *bio;
- bio = bio_alloc(GFP_NOIO, num);
+ bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
while (num > 0) {
bh = *bhs;
if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
@@ -235,7 +236,6 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
num--;
}
bio->bi_end_io = gfs2_meta_read_endio;
- bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
}
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7f8410d8fdc1..c9b423c874a3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -251,14 +251,12 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
ClearPageDirty(page);
lock_page(page);
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS);
bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
- bio_set_dev(bio, sb->s_bdev);
bio_add_page(bio, page, PAGE_SIZE, 0);
bio->bi_end_io = end_bio_io_page;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META);
submit_bio(bio);
wait_on_page_locked(page);
bio_put(bio);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c3b00ba92ed2..801ad9f4f2be 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -680,13 +680,14 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
/**
* gfs2_rs_delete - delete a multi-block reservation
* @ip: The inode for this reservation
- * @wcount: The inode's write count, or NULL
*
*/
-void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rs_delete(struct gfs2_inode *ip)
{
+ struct inode *inode = &ip->i_inode;
+
down_write(&ip->i_rw_mutex);
- if ((wcount == NULL) || (atomic_read(wcount) <= 1))
+ if (atomic_read(&inode->i_writecount) <= 1)
gfs2_rs_deltree(&ip->i_res);
up_write(&ip->i_rw_mutex);
}
@@ -922,17 +923,17 @@ static int read_rindex_entry(struct gfs2_inode *ip)
spin_lock_init(&rgd->rd_rsspin);
mutex_init(&rgd->rd_mutex);
- error = compute_bitstructs(rgd);
- if (error)
- goto fail;
-
error = gfs2_glock_get(sdp, rgd->rd_addr,
&gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
if (error)
goto fail;
+ error = compute_bitstructs(rgd);
+ if (error)
+ goto fail_glock;
+
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
- rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
+ rgd->rd_flags &= ~GFS2_RDF_PREFERRED;
if (rgd->rd_data > sdp->sd_max_rg_data)
sdp->sd_max_rg_data = rgd->rd_data;
spin_lock(&sdp->sd_rindex_spin);
@@ -944,6 +945,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
}
error = 0; /* someone else read in the rgrp; free it and ignore it */
+fail_glock:
gfs2_glock_put(rgd->rd_gl);
fail:
@@ -1185,8 +1187,8 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
}
/**
- * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
- * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ * gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps
+ * @gh: the glock holder representing the rgrpd to read in
*
* Read in all of a Resource Group's header and bitmap blocks.
* Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
@@ -1194,10 +1196,11 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
* Returns: errno
*/
-static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh)
{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_rgrpd *rgd = gl->gl_object;
struct gfs2_sbd *sdp = rgd->rd_sbd;
- struct gfs2_glock *gl = rgd->rd_gl;
unsigned int length = rgd->rd_length;
struct gfs2_bitmap *bi;
unsigned int x, y;
@@ -1225,21 +1228,18 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
}
}
- if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
- gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
- rgrp_set_bitmap_flags(rgd);
- rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
- rgd->rd_free_clone = rgd->rd_free;
- BUG_ON(rgd->rd_reserved);
- /* max out the rgrp allocation failure point */
- rgd->rd_extfail_pt = rgd->rd_free;
- }
+ gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
+ rgrp_set_bitmap_flags(rgd);
+ rgd->rd_flags |= GFS2_RDF_CHECK;
+ rgd->rd_free_clone = rgd->rd_free;
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
+ /* max out the rgrp allocation failure point */
+ rgd->rd_extfail_pt = rgd->rd_free;
if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
rgd->rd_bits[0].bi_bh->b_data);
- }
- else if (sdp->sd_args.ar_rgrplvb) {
+ } else if (sdp->sd_args.ar_rgrplvb) {
if (!gfs2_rgrp_lvb_valid(rgd)){
gfs2_consist_rgrpd(rgd);
error = -EIO;
@@ -1257,19 +1257,18 @@ fail:
bi->bi_bh = NULL;
gfs2_assert_warn(sdp, !bi->bi_clone);
}
-
return error;
}
-static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
+static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh)
{
u32 rl_flags;
- if (rgd->rd_flags & GFS2_RDF_UPTODATE)
+ if (!test_bit(GLF_INSTANTIATE_NEEDED, &gh->gh_gl->gl_flags))
return 0;
if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
- return gfs2_rgrp_bh_get(rgd);
+ return gfs2_instantiate(gh);
rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
rl_flags &= ~GFS2_RDF_MASK;
@@ -1280,7 +1279,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
rgrp_set_bitmap_flags(rgd);
rgd->rd_free_clone = rgd->rd_free;
- BUG_ON(rgd->rd_reserved);
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
/* max out the rgrp allocation failure point */
rgd->rd_extfail_pt = rgd->rd_free;
rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
@@ -1288,16 +1287,6 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
return 0;
}
-int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
-{
- struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
- struct gfs2_sbd *sdp = rgd->rd_sbd;
-
- if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
- return 0;
- return gfs2_rgrp_bh_get(rgd);
-}
-
/**
* gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
* @rgd: The resource group
@@ -1315,6 +1304,7 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
bi->bi_bh = NULL;
}
}
+ set_bit(GLF_INSTANTIATE_NEEDED, &rgd->rd_gl->gl_flags);
}
int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
@@ -1427,7 +1417,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
start = r.start >> bs_shift;
end = start + (r.len >> bs_shift);
- minlen = max_t(u64, r.minlen,
+ minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
+ minlen = max_t(u64, minlen,
q->limits.discard_granularity) >> bs_shift;
if (end <= start || minlen > sdp->sd_max_rg_data)
@@ -2113,7 +2104,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
gfs2_rgrp_congested(rs->rs_rgd, loops))
goto skip_rgrp;
if (sdp->sd_args.ar_rgrplvb) {
- error = update_rgrp_lvb(rs->rs_rgd);
+ error = update_rgrp_lvb(rs->rs_rgd,
+ &ip->i_rgd_gh);
if (unlikely(error)) {
rgrp_unlock_local(rs->rs_rgd);
gfs2_glock_dq_uninit(&ip->i_rgd_gh);
@@ -2128,8 +2120,11 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
(loops == 0 && target > rs->rs_rgd->rd_extfail_pt))
goto skip_rgrp;
- if (sdp->sd_args.ar_rgrplvb)
- gfs2_rgrp_bh_get(rs->rs_rgd);
+ if (sdp->sd_args.ar_rgrplvb) {
+ error = gfs2_instantiate(&ip->i_rgd_gh);
+ if (error)
+ goto skip_rgrp;
+ }
/* Get a reservation if we don't already have one */
if (!gfs2_rs_active(rs))
@@ -2215,7 +2210,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
struct gfs2_rgrpd *rgd = rs->rs_rgd;
spin_lock(&rgd->rd_rsspin);
- BUG_ON(rgd->rd_reserved < rs->rs_reserved);
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved < rs->rs_reserved);
rgd->rd_reserved -= rs->rs_reserved;
spin_unlock(&rgd->rd_rsspin);
rs->rs_reserved = 0;
@@ -2476,9 +2471,9 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
spin_unlock(&rbm.rgd->rd_rsspin);
goto rgrp_error;
}
- BUG_ON(rbm.rgd->rd_reserved < *nblocks);
- BUG_ON(rbm.rgd->rd_free_clone < *nblocks);
- BUG_ON(rbm.rgd->rd_free < *nblocks);
+ GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_reserved < *nblocks);
+ GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free_clone < *nblocks);
+ GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free < *nblocks);
rbm.rgd->rd_reserved -= *nblocks;
rbm.rgd->rd_free_clone -= *nblocks;
rbm.rgd->rd_free -= *nblocks;
@@ -2765,8 +2760,6 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
void rgrp_lock_local(struct gfs2_rgrpd *rgd)
{
- BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl) &&
- !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags));
mutex_lock(&rgd->rd_mutex);
}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index a6855fd796e0..46dd94e9e085 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -31,7 +31,7 @@ extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
-extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh);
extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
@@ -45,7 +45,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
bool dinode, u64 *generation);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rs_delete(struct gfs2_inode *ip);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 6e00d15ef0a8..bdb773e5c88f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1244,11 +1244,9 @@ static enum dinode_demise evict_should_delete(struct inode *inode,
if (ret)
return SHOULD_NOT_DELETE_DINODE;
- if (test_bit(GIF_INVALID, &ip->i_flags)) {
- ret = gfs2_inode_refresh(ip);
- if (ret)
- return SHOULD_NOT_DELETE_DINODE;
- }
+ ret = gfs2_instantiate(gh);
+ if (ret)
+ return SHOULD_NOT_DELETE_DINODE;
/*
* The inode may have been recreated in the meantime.
@@ -1398,17 +1396,10 @@ out:
truncate_inode_pages_final(&inode->i_data);
if (ip->i_qadata)
gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
- gfs2_rs_delete(ip, NULL);
+ gfs2_rs_deltree(&ip->i_res);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
- if (ip->i_gl) {
- glock_clear_object(ip->i_gl, ip);
- wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
- gfs2_glock_add_to_lru(ip->i_gl);
- gfs2_glock_put_eventually(ip->i_gl);
- ip->i_gl = NULL;
- }
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
@@ -1421,13 +1412,20 @@ out:
gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_put_eventually(gl);
}
+ if (ip->i_gl) {
+ glock_clear_object(ip->i_gl, ip);
+ wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
+ gfs2_glock_add_to_lru(ip->i_gl);
+ gfs2_glock_put_eventually(ip->i_gl);
+ ip->i_gl = NULL;
+ }
}
static struct inode *gfs2_alloc_inode(struct super_block *sb)
{
struct gfs2_inode *ip;
- ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+ ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
if (!ip)
return NULL;
ip->i_flags = 0;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index c0a34d9ddee4..d87ea98cf535 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -15,7 +15,7 @@
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/gfs2_ondisk.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include "gfs2.h"
#include "incore.h"
@@ -767,8 +767,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
wait_for_completion(&sdp->sd_kobj_unregister);
}
-static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
- struct kobj_uevent_env *env)
+static int gfs2_uevent(struct kobject *kobj, struct kobj_uevent_env *env)
{
struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
struct super_block *s = sdp->sd_vfs;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index bd6c8e9e49db..a5deb9f86831 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -197,15 +197,14 @@ TRACE_EVENT(gfs2_demote_rq,
/* Promotion/grant of a glock */
TRACE_EVENT(gfs2_promote,
- TP_PROTO(const struct gfs2_holder *gh, int first),
+ TP_PROTO(const struct gfs2_holder *gh),
- TP_ARGS(gh, first),
+ TP_ARGS(gh),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( u64, glnum )
__field( u32, gltype )
- __field( int, first )
__field( u8, state )
),
@@ -213,14 +212,12 @@ TRACE_EVENT(gfs2_promote,
__entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gh->gh_gl->gl_name.ln_number;
__entry->gltype = gh->gh_gl->gl_name.ln_type;
- __entry->first = first;
__entry->state = glock_trace_state(gh->gh_state);
),
- TP_printk("%u,%u glock %u:%llu promote %s %s",
+ TP_printk("%u,%u glock %u:%llu promote %s",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
(unsigned long long)__entry->glnum,
- __entry->first ? "first": "other",
glock_trace_name(__entry->state))
);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index cf345a86ef67..8241029a2a5d 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -454,6 +454,7 @@ void gfs2_consist_inode_i(struct gfs2_inode *ip,
(unsigned long long)ip->i_no_formal_ino,
(unsigned long long)ip->i_no_addr,
function, file, line);
+ gfs2_dump_glock(NULL, ip->i_gl, 1);
gfs2_withdraw(sdp);
}
@@ -475,6 +476,7 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
" function = %s, file = %s, line = %u\n",
(unsigned long long)rgd->rd_addr,
function, file, line);
+ gfs2_dump_glock(NULL, rgd->rd_gl, 1);
gfs2_withdraw(sdp);
}
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 4a95a92546a0..55f45e9b4930 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -159,7 +159,8 @@ static int hfs_writepages(struct address_space *mapping,
}
const struct address_space_operations hfs_btree_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfs_readpage,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
@@ -169,7 +170,8 @@ const struct address_space_operations hfs_btree_aops = {
};
const struct address_space_operations hfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfs_readpage,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
@@ -462,8 +464,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
goto out;
if (S_ISDIR(main_inode->i_mode)) {
- if (fd.entrylength < sizeof(struct hfs_cat_dir))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfs_cat_dir));
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
if (rec.type != HFS_CDR_DIR ||
@@ -483,8 +484,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
} else {
- if (fd.entrylength < sizeof(struct hfs_cat_file))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfs_cat_file));
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
if (rec.type != HFS_CDR_FIL ||
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index cdf0edeeb278..8082eb01127c 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -9,7 +9,7 @@
*/
#include <linux/cdrom.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/nls.h>
#include <linux/slab.h>
@@ -36,7 +36,7 @@ static int hfs_get_last_session(struct super_block *sb,
/* default values */
*start = 0;
- *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ *size = bdev_nr_sectors(sb->s_bdev);
if (HFS_SB(sb)->session >= 0) {
struct cdrom_tocentry te;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 12d9bae39363..6764afa98a6f 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -162,7 +162,7 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
{
struct hfs_inode_info *i;
- i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, hfs_inode_cachep, GFP_KERNEL);
return i ? &i->vfs_inode : NULL;
}
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 456e87aec7fd..68b4240c6191 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -260,8 +260,10 @@ struct hfsplus_cat_folder {
__be32 access_date;
__be32 backup_date;
struct hfsplus_perm permissions;
- struct DInfo user_info;
- struct DXInfo finder_info;
+ struct_group_attr(info, __packed,
+ struct DInfo user_info;
+ struct DXInfo finder_info;
+ );
__be32 text_encoding;
__be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */
} __packed;
@@ -294,8 +296,10 @@ struct hfsplus_cat_file {
__be32 access_date;
__be32 backup_date;
struct hfsplus_perm permissions;
- struct FInfo user_info;
- struct FXInfo finder_info;
+ struct_group_attr(info, __packed,
+ struct FInfo user_info;
+ struct FXInfo finder_info;
+ );
__be32 text_encoding;
u32 reserved2;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 6fef67c2a9f0..446a816aa8e1 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -156,7 +156,8 @@ static int hfsplus_writepages(struct address_space *mapping,
}
const struct address_space_operations hfsplus_btree_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfsplus_readpage,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
@@ -166,7 +167,8 @@ const struct address_space_operations hfsplus_btree_aops = {
};
const struct address_space_operations hfsplus_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfsplus_readpage,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
@@ -509,8 +511,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
if (type == HFSPLUS_FOLDER) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd->entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_folder));
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_folder));
hfsplus_get_perms(inode, &folder->permissions, 1);
@@ -530,8 +531,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
} else if (type == HFSPLUS_FILE) {
struct hfsplus_cat_file *file = &entry.file;
- if (fd->entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_file));
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_file));
@@ -588,8 +588,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
if (S_ISDIR(main_inode->i_mode)) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd.entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_folder));
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_folder));
/* simple node checks? */
@@ -614,8 +613,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
} else {
struct hfsplus_cat_file *file = &entry.file;
- if (fd.entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_file));
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
hfsplus_inode_write_fork(inode, &file->data_fork);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index b9e3db3f855f..8479add998b5 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -624,7 +624,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
{
struct hfsplus_inode_info *i;
- i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, hfsplus_inode_cachep, GFP_KERNEL);
return i ? &i->vfs_inode : NULL;
}
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 0350dc7821bf..0b8ad6586df5 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -12,7 +12,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/cdrom.h>
-#include <linux/genhd.h>
#include <asm/unaligned.h>
#include "hfsplus_fs.h"
@@ -64,10 +63,8 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
offset = start & (io_size - 1);
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
- bio = bio_alloc(GFP_NOIO, 1);
+ bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, sb->s_bdev);
- bio_set_op_attrs(bio, op, op_flags);
if (op != WRITE && data)
*data = (u8 *)buf + offset;
@@ -131,7 +128,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
/* default values */
*start = 0;
- *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ *size = bdev_nr_sectors(sb->s_bdev);
if (HFSPLUS_SB(sb)->session >= 0) {
struct cdrom_tocentry te;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index e2855ceefd39..49891b12c415 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -296,7 +296,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
sizeof(hfsplus_cat_entry));
if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) {
if (size == folder_finderinfo_len) {
- memcpy(&entry.folder.user_info, value,
+ memcpy(&entry.folder.info, value,
folder_finderinfo_len);
hfs_bnode_write(cat_fd.bnode, &entry,
cat_fd.entryoffset,
@@ -309,7 +309,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
}
} else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) {
if (size == file_finderinfo_len) {
- memcpy(&entry.file.user_info, value,
+ memcpy(&entry.file.info, value,
file_finderinfo_len);
hfs_bnode_write(cat_fd.bnode, &entry,
cat_fd.entryoffset,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index d5c9d886cd9f..14f9ac973a2e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -14,6 +14,7 @@
#include <linux/statfs.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
+#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include "hostfs.h"
@@ -222,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
+ hi = alloc_inode_sb(sb, hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
@@ -504,7 +505,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
static const struct address_space_operations hostfs_aops = {
.writepage = hostfs_writepage,
.readpage = hostfs_readpage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = hostfs_write_begin,
.write_end = hostfs_write_end,
};
@@ -924,6 +925,9 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
sb->s_op = &hostfs_sbops;
sb->s_d_op = &simple_dentry_operations;
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ err = super_setup_bdi(sb);
+ if (err)
+ goto out;
/* NULL is printed as '(null)' by printf(): avoid that. */
if (req_root == NULL)
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index fb37f57130aa..99493a23c5d0 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -245,7 +245,8 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
const struct address_space_operations hpfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hpfs_readpage,
.writepage = hpfs_writepage,
.readahead = hpfs_readahead,
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index d92c4af3e1b4..281dec8f636b 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -409,10 +409,10 @@ struct bplus_header
__le16 first_free; /* offset from start of header to
first free node in array */
union {
- struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
- subtree pointers */
- struct bplus_leaf_node external[0]; /* (external) 3-word entries giving
- sector runs */
+ /* (internal) 2-word entries giving subtree pointers */
+ DECLARE_FLEX_ARRAY(struct bplus_internal_node, internal);
+ /* (external) 3-word entries giving sector runs */
+ DECLARE_FLEX_ARRAY(struct bplus_leaf_node, external);
} u;
};
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a7dbfc892022..1cb89595b875 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -232,7 +232,7 @@ static struct kmem_cache * hpfs_inode_cachep;
static struct inode *hpfs_alloc_inode(struct super_block *sb)
{
struct hpfs_inode_info *ei;
- ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, hpfs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cdfb1ae78a3f..dd3a088db11d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
addr = vm_unmapped_area(&info);
}
@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (mmap_end - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -409,10 +410,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
struct vm_area_struct *vma;
/*
- * end == 0 indicates that the entire range after
- * start should be unmapped.
+ * end == 0 indicates that the entire range after start should be
+ * unmapped. Note, end is exclusive, whereas the interval tree takes
+ * an inclusive "last".
*/
- vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
unsigned long v_offset;
unsigned long v_end;
@@ -1109,7 +1111,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
return NULL;
- p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
+ p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
if (unlikely(!p)) {
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
@@ -1143,7 +1145,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
.migratepage = hugetlbfs_migrate_page,
.error_remove_page = hugetlbfs_error_remove_page,
};
@@ -1446,8 +1448,8 @@ static int get_hstate_idx(int page_size_log)
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/
struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag, struct ucounts **ucounts,
- int creat_flags, int page_size_log)
+ vm_flags_t acctflag, int creat_flags,
+ int page_size_log)
{
struct inode *inode;
struct vfsmount *mnt;
@@ -1458,22 +1460,19 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (hstate_idx < 0)
return ERR_PTR(-ENODEV);
- *ucounts = NULL;
mnt = hugetlbfs_vfsmount[hstate_idx];
if (!mnt)
return ERR_PTR(-ENOENT);
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
- *ucounts = current_ucounts();
- if (user_shm_lock(size, *ucounts)) {
- task_lock(current);
- pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+ struct ucounts *ucounts = current_ucounts();
+
+ if (user_shm_lock(size, ucounts)) {
+ pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
current->comm, current->pid);
- task_unlock(current);
- } else {
- *ucounts = NULL;
- return ERR_PTR(-EPERM);
+ user_shm_unlock(size, ucounts);
}
+ return ERR_PTR(-EPERM);
}
file = ERR_PTR(-ENOSPC);
@@ -1498,10 +1497,6 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
iput(inode);
out:
- if (*ucounts) {
- user_shm_unlock(size, *ucounts);
- *ucounts = NULL;
- }
return file;
}
diff --git a/fs/inode.c b/fs/inode.c
index ed0cab8a32db..9d9b422504d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -67,11 +67,6 @@ const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);
-/*
- * Statistics gathering..
- */
-struct inodes_stat_t inodes_stat;
-
static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);
@@ -106,13 +101,43 @@ long get_nr_dirty_inodes(void)
* Handle nr_inode sysctl
*/
#ifdef CONFIG_SYSCTL
-int proc_nr_inodes(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+/*
+ * Statistics gathering..
+ */
+static struct inodes_stat_t inodes_stat;
+
+static int proc_nr_inodes(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
inodes_stat.nr_inodes = get_nr_inodes();
inodes_stat.nr_unused = get_nr_inodes_unused();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
+
+static struct ctl_table inodes_sysctls[] = {
+ {
+ .procname = "inode-nr",
+ .data = &inodes_stat,
+ .maxlen = 2*sizeof(long),
+ .mode = 0444,
+ .proc_handler = proc_nr_inodes,
+ },
+ {
+ .procname = "inode-state",
+ .data = &inodes_stat,
+ .maxlen = 7*sizeof(long),
+ .mode = 0444,
+ .proc_handler = proc_nr_inodes,
+ },
+ { }
+};
+
+static int __init init_fs_inode_sysctls(void)
+{
+ register_sysctl_init("fs", inodes_sysctls);
+ return 0;
+}
+early_initcall(init_fs_inode_sysctls);
#endif
static int no_open(struct inode *inode, struct file *file)
@@ -180,8 +205,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
- if (sb->s_type->fs_flags & FS_THP_SUPPORT)
- __set_bit(AS_THP_SUPPORT, &mapping->flags);
mapping->wb_err = 0;
atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
@@ -236,7 +259,7 @@ static struct inode *alloc_inode(struct super_block *sb)
if (ops->alloc_inode)
inode = ops->alloc_inode(sb);
else
- inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
+ inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
if (!inode)
return NULL;
@@ -428,11 +451,20 @@ void ihold(struct inode *inode)
}
EXPORT_SYMBOL(ihold);
-static void inode_lru_list_add(struct inode *inode)
+static void __inode_add_lru(struct inode *inode, bool rotate)
{
+ if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+ return;
+ if (atomic_read(&inode->i_count))
+ return;
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
+ return;
+ if (!mapping_shrinkable(&inode->i_data))
+ return;
+
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
- else
+ else if (rotate)
inode->i_state |= I_REFERENCED;
}
@@ -443,16 +475,11 @@ static void inode_lru_list_add(struct inode *inode)
*/
void inode_add_lru(struct inode *inode)
{
- if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
- I_FREEING | I_WILL_FREE)) &&
- !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
- inode_lru_list_add(inode);
+ __inode_add_lru(inode, false);
}
-
static void inode_lru_list_del(struct inode *inode)
{
-
if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
}
@@ -524,6 +551,55 @@ void __remove_inode_hash(struct inode *inode)
}
EXPORT_SYMBOL(__remove_inode_hash);
+void dump_mapping(const struct address_space *mapping)
+{
+ struct inode *host;
+ const struct address_space_operations *a_ops;
+ struct hlist_node *dentry_first;
+ struct dentry *dentry_ptr;
+ struct dentry dentry;
+ unsigned long ino;
+
+ /*
+ * If mapping is an invalid pointer, we don't want to crash
+ * accessing it, so probe everything depending on it carefully.
+ */
+ if (get_kernel_nofault(host, &mapping->host) ||
+ get_kernel_nofault(a_ops, &mapping->a_ops)) {
+ pr_warn("invalid mapping:%px\n", mapping);
+ return;
+ }
+
+ if (!host) {
+ pr_warn("aops:%ps\n", a_ops);
+ return;
+ }
+
+ if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
+ get_kernel_nofault(ino, &host->i_ino)) {
+ pr_warn("aops:%ps invalid inode:%px\n", a_ops, host);
+ return;
+ }
+
+ if (!dentry_first) {
+ pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
+ return;
+ }
+
+ dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
+ if (get_kernel_nofault(dentry, dentry_ptr)) {
+ pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
+ a_ops, ino, dentry_ptr);
+ return;
+ }
+
+ /*
+ * if dentry is corrupted, the %pd handler may still crash,
+ * but it's unlikely that we reach here with a corrupt mapping
+ */
+ pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry);
+}
+
void clear_inode(struct inode *inode)
{
/*
@@ -728,10 +804,6 @@ again:
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
- * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. If the inode has metadata buffers attached to
- * mapping->private_list then try to remove them.
- *
* If the inode has the I_REFERENCED flag set, then it means that it has been
* used recently - the flag is set in iput_final(). When we encounter such an
* inode, clear the flag and move it to the back of the LRU so it gets another
@@ -747,31 +819,39 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
struct inode *inode = container_of(item, struct inode, i_lru);
/*
- * we are inverting the lru lock/inode->i_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock/inode->i_lock here, so use a
+ * trylock. If we fail to get the lock, just skip it.
*/
if (!spin_trylock(&inode->i_lock))
return LRU_SKIP;
/*
- * Referenced or dirty inodes are still in use. Give them another pass
- * through the LRU as we canot reclaim them now.
+ * Inodes can get referenced, redirtied, or repopulated while
+ * they're already on the LRU, and this can make them
+ * unreclaimable for a while. Remove them lazily here; iput,
+ * sync, or the last page cache deletion will requeue them.
*/
if (atomic_read(&inode->i_count) ||
- (inode->i_state & ~I_REFERENCED)) {
+ (inode->i_state & ~I_REFERENCED) ||
+ !mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
}
- /* recently referenced inodes get one more pass */
+ /* Recently referenced inodes get one more pass */
if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
}
+ /*
+ * On highmem systems, mapping_shrinkable() permits dropping
+ * page cache in order to free up struct inodes: lowmem might
+ * be under pressure before the cache inside the highmem zone.
+ */
if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
__iget(inode);
spin_unlock(&inode->i_lock);
@@ -1638,7 +1718,7 @@ static void iput_final(struct inode *inode)
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
(sb->s_flags & SB_ACTIVE)) {
- inode_add_lru(inode);
+ __inode_add_lru(inode, true);
spin_unlock(&inode->i_lock);
return;
}
@@ -1782,12 +1862,13 @@ EXPORT_SYMBOL(generic_update_time);
* This does the actual work of updating an inodes time or version. Must have
* had called mnt_want_write() before calling this.
*/
-static int update_time(struct inode *inode, struct timespec64 *time, int flags)
+int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
if (inode->i_op->update_time)
return inode->i_op->update_time(inode, time, flags);
return generic_update_time(inode, time, flags);
}
+EXPORT_SYMBOL(inode_update_time);
/**
* atime_needs_update - update the access time
@@ -1857,7 +1938,7 @@ void touch_atime(const struct path *path)
* of the fs read only, e.g. subvolumes in Btrfs.
*/
now = current_time(inode);
- update_time(inode, &now, S_ATIME);
+ inode_update_time(inode, &now, S_ATIME);
__mnt_drop_write(mnt);
skip_update:
sb_end_write(inode->i_sb);
@@ -2002,7 +2083,7 @@ int file_update_time(struct file *file)
if (__mnt_want_write_file(file))
return 0;
- ret = update_time(inode, &now, sync_it);
+ ret = inode_update_time(inode, &now, sync_it);
__mnt_drop_write_file(file);
return ret;
diff --git a/fs/internal.h b/fs/internal.h
index 3cd065c8a66b..08503dc68d2b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -23,22 +23,11 @@ struct pipe_inode_info;
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
-extern int __sync_blockdev(struct block_device *bdev, int wait);
-void iterate_bdevs(void (*)(struct block_device *, void *), void *);
void emergency_thaw_bdev(struct super_block *sb);
#else
static inline void bdev_cache_init(void)
{
}
-
-static inline int __sync_blockdev(struct block_device *bdev, int wait)
-{
- return 0;
-}
-static inline void iterate_bdevs(void (*f)(struct block_device *, void *),
- void *arg)
-{
-}
static inline int emergency_thaw_bdev(struct super_block *sb)
{
return 0;
@@ -48,7 +37,7 @@ static inline int emergency_thaw_bdev(struct super_block *sb)
/*
* buffer.c
*/
-int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block, const struct iomap *iomap);
/*
@@ -85,7 +74,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
* namespace.c
*/
extern struct vfsmount *lookup_mnt(const struct path *);
-extern int finish_automount(struct vfsmount *, struct path *);
+extern int finish_automount(struct vfsmount *, const struct path *);
extern int sb_prepare_remount_readonly(struct super_block *);
@@ -149,7 +138,6 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern void inode_add_lru(struct inode *inode);
extern int dentry_needs_remove_privs(struct dentry *dentry);
/*
@@ -170,11 +158,6 @@ extern void dput_to_list(struct dentry *, struct list_head *);
extern void shrink_dentry_list(struct list_head *);
/*
- * read_write.c
- */
-extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
-
-/*
* pipe.c
*/
extern const struct file_operations pipefifo_fops;
@@ -196,7 +179,9 @@ int sb_init_dio_done_wq(struct super_block *sb);
/*
* fs/stat.c:
*/
-int do_statx(int dfd, const char __user *filename, unsigned flags,
+
+int getname_statx_lookup_flags(int flags);
+int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
/*
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 422a7ed6a9bd..32aeb2c581c5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -13,7 +13,8 @@
#include <linux/slab.h>
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
-#include <linux/tracehook.h>
+#include <linux/task_work.h>
+#include <linux/audit.h>
#include <uapi/linux/io_uring.h>
#include "io-wq.h"
@@ -47,7 +48,8 @@ struct io_worker {
struct io_wqe *wqe;
struct io_wq_work *cur_work;
- spinlock_t lock;
+ struct io_wq_work *next_work;
+ raw_spinlock_t lock;
struct completion ref_done;
@@ -74,6 +76,7 @@ struct io_wqe_acct {
unsigned max_workers;
int index;
atomic_t nr_running;
+ raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long flags;
};
@@ -89,7 +92,7 @@ enum {
*/
struct io_wqe {
raw_spinlock_t lock;
- struct io_wqe_acct acct[2];
+ struct io_wqe_acct acct[IO_WQ_ACCT_NR];
int node;
@@ -140,6 +143,8 @@ static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match);
+static void create_worker_cb(struct callback_head *cb);
+static void io_wq_cancel_tw_create(struct io_wq *wq);
static bool io_worker_get(struct io_worker *worker)
{
@@ -174,24 +179,58 @@ static void io_worker_ref_put(struct io_wq *wq)
complete(&wq->worker_done);
}
+static void io_worker_cancel_cb(struct io_worker *worker)
+{
+	struct io_wqe *wqe = worker->wqe;
+	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+
+	/* drop the counts and the wq ref, clear create_state, release worker */
+	atomic_dec(&acct->nr_running);
+	raw_spin_lock(&wqe->lock);
+	acct->nr_workers--;
+	raw_spin_unlock(&wqe->lock);
+	io_worker_ref_put(wqe->wq);
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
+}
+
+static bool io_task_worker_match(struct callback_head *cb, void *data)
+{
+	/* Match only the create_worker_cb item embedded in worker @data. */
+	if (cb->func == create_worker_cb)
+		return container_of(cb, struct io_worker,
+				    create_work) == data;
+
+	return false;
+}
+
static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
- if (refcount_dec_and_test(&worker->ref))
- complete(&worker->ref_done);
+ while (1) {
+ struct callback_head *cb = task_work_cancel_match(wq->task,
+ io_task_worker_match, worker);
+
+ if (!cb)
+ break;
+ io_worker_cancel_cb(worker);
+ }
+
+ io_worker_release(worker);
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wqe->lock);
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
- preempt_disable();
+ raw_spin_unlock(&wqe->lock);
io_wqe_dec_running(worker);
worker->flags = 0;
+ preempt_disable();
current->flags &= ~PF_IO_WORKER;
preempt_enable();
- raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu);
io_worker_ref_put(wqe->wq);
@@ -200,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker)
static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
+ bool ret = false;
+
+ raw_spin_lock(&acct->lock);
if (!wq_list_empty(&acct->work_list) &&
!test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- return true;
- return false;
+ ret = true;
+ raw_spin_unlock(&acct->lock);
+
+ return ret;
}
/*
@@ -321,10 +365,22 @@ static bool io_queue_worker_create(struct io_worker *worker,
test_and_set_bit_lock(0, &worker->create_state))
goto fail_release;
+ atomic_inc(&wq->worker_refs);
init_task_work(&worker->create_work, func);
worker->create_index = acct->index;
- if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
+ /*
+ * EXIT may have been set after checking it above, check after
+ * adding the task_work and remove any creation item if it is
+ * now set. wq exit does that too, but we can have added this
+ * work item after we canceled in io_wq_exit_workers().
+ */
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
+ io_wq_cancel_tw_create(wq);
+ io_worker_ref_put(wq);
return true;
+ }
+ io_worker_ref_put(wq);
clear_bit_unlock(0, &worker->create_state);
fail_release:
io_worker_release(worker);
@@ -335,7 +391,6 @@ fail:
}
static void io_wqe_dec_running(struct io_worker *worker)
- __must_hold(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -343,24 +398,27 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (!(worker->flags & IO_WORKER_F_UP))
return;
- if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(worker, acct, create_worker_cb);
- }
+ if (!atomic_dec_and_test(&acct->nr_running))
+ return;
+ if (!io_acct_run_queue(acct))
+ return;
+
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ io_queue_worker_create(worker, acct, create_worker_cb);
}
/*
* Worker will start processing some work. Move it to the busy list, if
* it's currently on the freelist
*/
-static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
- struct io_wq_work *work)
- __must_hold(wqe->lock)
+static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
{
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
+ raw_spin_lock(&wqe->lock);
hlist_nulls_del_init_rcu(&worker->nulls_node);
+ raw_spin_unlock(&wqe->lock);
}
}
@@ -385,9 +443,10 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
return work->flags >> IO_WQ_HASH_SHIFT;
}
-static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
+static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
{
struct io_wq *wq = wqe->wq;
+ bool ret = false;
spin_lock_irq(&wq->hash->wait.lock);
if (list_empty(&wqe->wait.entry)) {
@@ -395,14 +454,16 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
if (!test_bit(hash, &wq->hash->map)) {
__set_current_state(TASK_RUNNING);
list_del_init(&wqe->wait.entry);
+ ret = true;
}
}
spin_unlock_irq(&wq->hash->wait.lock);
+ return ret;
}
static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
struct io_worker *worker)
- __must_hold(wqe->lock)
+ __must_hold(acct->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
@@ -437,14 +498,21 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
}
if (stall_hash != -1U) {
+ bool unstalled;
+
/*
* Set this before dropping the lock to avoid racing with new
* work being added and clearing the stalled bit.
*/
set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&wqe->lock);
- io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
+ unstalled = io_wait_on_hash(wqe, stall_hash);
+ raw_spin_lock(&acct->lock);
+ if (unstalled) {
+ clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ if (wq_has_sleeper(&wqe->wq->hash->wait))
+ wake_up(&wqe->wq->hash->wait);
+ }
}
return NULL;
@@ -454,7 +522,9 @@ static bool io_flush_signals(void)
{
if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
__set_current_state(TASK_RUNNING);
- tracehook_notify_signal();
+ clear_notify_signal();
+ if (task_work_pending(current))
+ task_work_run();
return true;
}
return false;
@@ -468,15 +538,15 @@ static void io_assign_current_work(struct io_worker *worker,
cond_resched();
}
- spin_lock(&worker->lock);
+ raw_spin_lock(&worker->lock);
worker->cur_work = work;
- spin_unlock(&worker->lock);
+ worker->next_work = NULL;
+ raw_spin_unlock(&worker->lock);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
- __releases(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -485,7 +555,7 @@ static void io_worker_handle_work(struct io_worker *worker)
do {
struct io_wq_work *work;
-get_next:
+
/*
* If we got some work, mark us as busy. If we didn't, but
* the list isn't empty, it means we stalled on hashed work.
@@ -493,13 +563,25 @@ get_next:
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
+ raw_spin_lock(&acct->lock);
work = io_get_next_work(acct, worker);
- if (work)
- __io_worker_busy(wqe, worker, work);
-
- raw_spin_unlock(&wqe->lock);
- if (!work)
+ raw_spin_unlock(&acct->lock);
+ if (work) {
+ __io_worker_busy(wqe, worker);
+
+ /*
+ * Make sure cancelation can find this, even before
+ * it becomes the active work. That avoids a window
+ * where the work has been removed from our general
+ * work list, but isn't yet discoverable as the
+ * current work item for this worker.
+ */
+ raw_spin_lock(&worker->lock);
+ worker->next_work = work;
+ raw_spin_unlock(&worker->lock);
+ } else {
break;
+ }
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
@@ -526,19 +608,15 @@ get_next:
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
+ /* serialize hash clear with wake_up() */
+ spin_lock_irq(&wq->hash->wait.lock);
clear_bit(hash, &wq->hash->map);
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ spin_unlock_irq(&wq->hash->wait.lock);
if (wq_has_sleeper(&wq->hash->wait))
wake_up(&wq->hash->wait);
- raw_spin_lock(&wqe->lock);
- /* skip unnecessary unlock-lock wqe->lock */
- if (!work)
- goto get_next;
- raw_spin_unlock(&wqe->lock);
}
} while (work);
-
- raw_spin_lock(&wqe->lock);
} while (1);
}
@@ -556,16 +634,16 @@ static int io_wqe_worker(void *data)
snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
set_task_comm(current, buf);
+ audit_alloc_kernel(current);
+
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
long ret;
set_current_state(TASK_INTERRUPTIBLE);
-loop:
- raw_spin_lock(&wqe->lock);
- if (io_acct_run_queue(acct)) {
+ while (io_acct_run_queue(acct))
io_worker_handle_work(worker);
- goto loop;
- }
+
+ raw_spin_lock(&wqe->lock);
/* timed out, exit unless we're the last worker */
if (last_timeout && acct->nr_workers > 1) {
acct->nr_workers--;
@@ -589,11 +667,10 @@ loop:
last_timeout = !ret;
}
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- raw_spin_lock(&wqe->lock);
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
io_worker_handle_work(worker);
- }
+ audit_free(current);
io_worker_exit(worker);
return 0;
}
@@ -603,7 +680,7 @@ loop:
*/
void io_wq_worker_running(struct task_struct *tsk)
{
- struct io_worker *worker = tsk->pf_io_worker;
+ struct io_worker *worker = tsk->worker_private;
if (!worker)
return;
@@ -621,7 +698,7 @@ void io_wq_worker_running(struct task_struct *tsk)
*/
void io_wq_worker_sleeping(struct task_struct *tsk)
{
- struct io_worker *worker = tsk->pf_io_worker;
+ struct io_worker *worker = tsk->worker_private;
if (!worker)
return;
@@ -631,16 +708,13 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
return;
worker->flags &= ~IO_WORKER_F_RUNNING;
-
- raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker);
- raw_spin_unlock(&worker->wqe->lock);
}
static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
struct task_struct *tsk)
{
- tsk->pf_io_worker = worker;
+ tsk->worker_private = worker;
worker->task = tsk;
set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
tsk->flags |= PF_NO_SETAFFINITY;
@@ -660,6 +734,13 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
static inline bool io_should_retry_thread(long err)
{
+ /*
+ * Prevent perpetual task_work retry, if the task (or its group) is
+ * exiting.
+ */
+ if (fatal_signal_pending(current))
+ return false;
+
switch (err) {
case -EAGAIN:
case -ERESTARTSYS:
@@ -697,10 +778,12 @@ static void create_worker_cont(struct callback_head *cb)
.cancel_all = true,
};
+ raw_spin_unlock(&wqe->lock);
while (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ ;
+ } else {
+ raw_spin_unlock(&wqe->lock);
}
- raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wqe->wq);
kfree(worker);
return;
@@ -716,11 +799,8 @@ static void io_workqueue_create(struct work_struct *work)
struct io_worker *worker = container_of(work, struct io_worker, work);
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ if (!io_queue_worker_create(worker, acct, create_worker_cont))
kfree(worker);
- }
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -744,7 +824,7 @@ fail:
refcount_set(&worker->ref, 1);
worker->wqe = wqe;
- spin_lock_init(&worker->lock);
+ raw_spin_lock_init(&worker->lock);
init_completion(&worker->ref_done);
if (index == IO_WQ_ACCT_BOUND)
@@ -836,6 +916,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ struct io_cb_cancel_data match;
unsigned work_flags = work->flags;
bool do_create;
@@ -849,10 +930,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock(&acct->lock);
io_wqe_insert_work(wqe, work);
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ raw_spin_unlock(&acct->lock);
+ raw_spin_lock(&wqe->lock);
rcu_read_lock();
do_create = !io_wqe_activate_free_worker(wqe, acct);
rcu_read_unlock();
@@ -868,18 +951,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
raw_spin_lock(&wqe->lock);
- /* fatal condition, failed to create the first worker */
- if (!acct->nr_workers) {
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_item,
- .data = work,
- .cancel_all = false,
- };
-
- if (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ if (acct->nr_workers) {
+ raw_spin_unlock(&wqe->lock);
+ return;
}
raw_spin_unlock(&wqe->lock);
+
+ /* fatal condition, failed to create the first worker: cancel this work */
+ match.fn = io_wq_work_match_item;
+ match.data = work;
+ match.cancel_all = false;
+ match.nr_pending = match.nr_running = 0; /* the cancel helper increments these */
+ io_acct_cancel_pending_work(wqe, acct, &match);
}
}
@@ -902,6 +985,19 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
}
+static bool __io_wq_worker_cancel(struct io_worker *worker,
+				  struct io_cb_cancel_data *match,
+				  struct io_wq_work *work)
+{
+	/* nothing in this slot, or the matcher rejects it */
+	if (!work || !match->fn(work, match->data))
+		return false;
+
+	work->flags |= IO_WQ_WORK_CANCEL;
+	set_notify_signal(worker->task);
+	return true;
+}
+
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *match = data;
@@ -910,13 +1006,11 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
* Hold the lock to avoid ->cur_work going out of scope, caller
* may dereference the passed in work.
*/
- spin_lock(&worker->lock);
- if (worker->cur_work &&
- match->fn(worker->cur_work, match->data)) {
- set_notify_signal(worker->task);
+ raw_spin_lock(&worker->lock);
+ if (__io_wq_worker_cancel(worker, match, worker->cur_work) ||
+ __io_wq_worker_cancel(worker, match, worker->next_work))
match->nr_running++;
- }
- spin_unlock(&worker->lock);
+ raw_spin_unlock(&worker->lock);
return match->nr_running && !match->cancel_all;
}
@@ -943,22 +1037,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match)
- __releases(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
+ raw_spin_lock(&acct->lock);
wq_list_for_each(node, prev, &acct->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
/* not safe to continue after unlock */
return true;
}
+ raw_spin_unlock(&acct->lock);
return false;
}
@@ -968,17 +1063,15 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
{
int i;
retry:
- raw_spin_lock(&wqe->lock);
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
if (io_acct_cancel_pending_work(wqe, acct, match)) {
if (match->cancel_all)
goto retry;
- return;
+ break;
}
}
- raw_spin_unlock(&wqe->lock);
}
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@@ -1003,6 +1096,14 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
* First check pending list, if we're lucky we can just remove it
* from there. CANCEL_OK means that the work is returned as-new,
* no completion will be posted for it.
+ *
+ * Then check if a free (going busy) or busy worker has the work
+ * currently running. If we find it there, we'll return CANCEL_RUNNING
+ * as an indication that we attempt to signal cancellation. The
+ * completion will run normally in this case.
+ *
+ * Do both for each wqe in turn, so a work item is found regardless of
+ * state: pending work is scanned under acct->lock, running under wqe->lock.
*/
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
@@ -1010,18 +1111,10 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
io_wqe_cancel_pending_work(wqe, &match);
if (match.nr_pending && !match.cancel_all)
return IO_WQ_CANCEL_OK;
- }
-
- /*
- * Now check if a free (going busy) or busy worker has the work
- * currently running. If we find it there, we'll return CANCEL_RUNNING
- * as an indication that we attempt to signal cancellation. The
- * completion will run normally in this case.
- */
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
+ raw_spin_lock(&wqe->lock);
io_wqe_cancel_running_work(wqe, &match);
+ raw_spin_unlock(&wqe->lock);
if (match.nr_running && !match.cancel_all)
return IO_WQ_CANCEL_RUNNING;
}
@@ -1100,6 +1193,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
acct->index = i;
atomic_set(&acct->nr_running, 0);
INIT_WQ_LIST(&acct->work_list);
+ raw_spin_lock_init(&acct->lock);
}
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
@@ -1140,28 +1234,26 @@ void io_wq_exit_start(struct io_wq *wq)
set_bit(IO_WQ_BIT_EXIT, &wq->state);
}
-static void io_wq_exit_workers(struct io_wq *wq)
+static void io_wq_cancel_tw_create(struct io_wq *wq)
{
struct callback_head *cb;
- int node;
-
- if (!wq->task)
- return;
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct io_worker *worker;
- struct io_wqe_acct *acct;
worker = container_of(cb, struct io_worker, create_work);
- acct = io_wqe_get_acct(worker);
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&worker->wqe->lock);
- acct->nr_workers--;
- raw_spin_unlock(&worker->wqe->lock);
- io_worker_ref_put(wq);
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ io_worker_cancel_cb(worker);
}
+}
+
+static void io_wq_exit_workers(struct io_wq *wq)
+{
+ int node;
+
+ if (!wq->task)
+ return;
+
+ io_wq_cancel_tw_create(wq);
rcu_read_lock();
for_each_node(node) {
@@ -1278,17 +1370,22 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
*/
int io_wq_max_workers(struct io_wq *wq, int *new_count)
{
- int i, node, prev = 0;
+ int prev[IO_WQ_ACCT_NR];
+ bool first_node = true;
+ int i, node;
BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
+ for (i = 0; i < IO_WQ_ACCT_NR; i++)
+ prev[i] = 0;
+
rcu_read_lock();
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
@@ -1297,14 +1394,19 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
raw_spin_lock(&wqe->lock);
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
acct = &wqe->acct[i];
- prev = max_t(int, acct->max_workers, prev);
+ if (first_node)
+ prev[i] = max_t(int, acct->max_workers, prev[i]);
if (new_count[i])
acct->max_workers = new_count[i];
- new_count[i] = prev;
}
raw_spin_unlock(&wqe->lock);
+ first_node = false;
}
rcu_read_unlock();
+
+ for (i = 0; i < IO_WQ_ACCT_NR; i++)
+ new_count[i] = prev[i];
+
return 0;
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index bf5c4c533760..dbecd27656c7 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -29,6 +29,17 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
+#define wq_list_for_each(pos, prv, head) \
+ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_for_each_resume(pos, prv) \
+ for (; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
+#define INIT_WQ_LIST(list) do { \
+ (list)->first = NULL; \
+} while (0)
+
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
@@ -41,6 +52,28 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
list->last = node;
}
+/**
+ * wq_list_merge - merge the second list to the first one.
+ * @list0: the first list
+ * @list1: the second list
+ * Return the first node of the merged chain; both lists are left empty.
+ */
+static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0,
+ struct io_wq_work_list *list1)
+{
+ struct io_wq_work_node *ret;
+
+ if (!list0->first) {
+ ret = list1->first;
+ } else {
+ ret = list0->first;
+ list0->last->next = list1->first;
+ }
+ INIT_WQ_LIST(list0);
+ INIT_WQ_LIST(list1);
+ return ret;
+}
+
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
@@ -54,6 +87,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
+static inline void wq_list_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_list *list)
+{
+ node->next = list->first;
+ if (!node->next)
+ list->last = node;
+ WRITE_ONCE(list->first, node);
+}
+
static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *last,
struct io_wq_work_node *prev)
@@ -69,6 +111,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list,
last->next = NULL;
}
+static inline void __wq_list_splice(struct io_wq_work_list *list,
+ struct io_wq_work_node *to)
+{
+ list->last->next = to->next;
+ to->next = list->first;
+ INIT_WQ_LIST(list);
+}
+
+static inline bool wq_list_splice(struct io_wq_work_list *list,
+				  struct io_wq_work_node *to)
+{
+	/* an empty list has nothing to hand over */
+	if (wq_list_empty(list))
+		return false;
+	__wq_list_splice(list, to);
+	return true;
+}
+
+static inline void wq_stack_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_node *stack)
+{
+ node->next = stack->next;
+ stack->next = node;
+}
+
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
@@ -76,14 +143,14 @@ static inline void wq_list_del(struct io_wq_work_list *list,
wq_list_cut(list, node, prev);
}
-#define wq_list_for_each(pos, prv, head) \
- for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+static inline
+struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
+{
+ struct io_wq_work_node *node = stack->next;
-#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
-#define INIT_WQ_LIST(list) do { \
- (list)->first = NULL; \
- (list)->last = NULL; \
-} while (0)
+ stack->next = node->next;
+ return node;
+}
struct io_wq_work {
struct io_wq_work_node list;
@@ -155,6 +222,6 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
static inline bool io_wq_current_is_worker(void)
{
return in_task() && (current->flags & PF_IO_WORKER) &&
- current->pf_io_worker;
+ current->worker_private;
}
#endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bc18af5e0a93..e0823f58f795 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -57,7 +57,7 @@
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
@@ -78,7 +78,8 @@
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
-#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/security.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -103,11 +104,14 @@
#define IORING_MAX_REG_BUFFERS (1U << 14)
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
+#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+
+#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
+ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
+
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+ REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -195,8 +199,10 @@ struct io_rings {
};
enum io_uring_cmd_flags {
- IO_URING_F_NONBLOCK = 1,
- IO_URING_F_COMPLETE_DEFER = 2,
+ IO_URING_F_COMPLETE_DEFER = 1,
+ IO_URING_F_UNLOCKED = 2,
+ /* int's last bit, sign checks are usually faster than a bit test */
+ IO_URING_F_NONBLOCK = INT_MIN,
};
struct io_mapped_ubuf {
@@ -255,11 +261,18 @@ struct io_rsrc_data {
bool quiesce;
};
+struct io_buffer_list {
+ struct list_head list;
+ struct list_head buf_list;
+ __u16 bgid;
+};
+
struct io_buffer {
struct list_head list;
__u64 addr;
__u32 len;
__u16 bid;
+ __u16 bgid;
};
struct io_restriction {
@@ -305,28 +318,27 @@ struct io_submit_link {
};
struct io_submit_state {
- struct blk_plug plug;
+ /* inline/task_work completion list, under ->uring_lock */
+ struct io_wq_work_node free_list;
+ /* batch completion logic */
+ struct io_wq_work_list compl_reqs;
struct io_submit_link link;
- /*
- * io_kiocb alloc cache
- */
- void *reqs[IO_REQ_CACHE_SIZE];
- unsigned int free_reqs;
-
bool plug_started;
+ bool need_plug;
+ bool flush_cqes;
+ unsigned short submit_nr;
+ struct blk_plug plug;
+};
- /*
- * Batch completion logic
- */
- struct io_kiocb *compl_reqs[IO_COMPL_BATCH];
- unsigned int compl_nr;
- /* inline/task_work completion list, under ->uring_lock */
- struct list_head free_list;
-
- unsigned int ios_left;
+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ unsigned int eventfd_async: 1;
+ struct rcu_head rcu;
};
+#define IO_BUFFERS_HASH_BITS 5
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -336,10 +348,11 @@ struct io_ring_ctx {
unsigned int flags;
unsigned int compat: 1;
unsigned int drain_next: 1;
- unsigned int eventfd_async: 1;
unsigned int restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
+ unsigned int drain_disabled: 1;
+ unsigned int has_evfd: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -368,6 +381,7 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
+ int rsrc_cached_refs;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
@@ -377,14 +391,16 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct xarray io_buffers;
+ struct list_head *io_buffers;
+ struct list_head io_buffers_cache;
+ struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
- struct list_head locked_free_list;
+ struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
@@ -398,8 +414,7 @@ struct io_ring_ctx {
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- struct eventfd_ctx *cq_ev_fd;
- struct wait_queue_head poll_wait;
+ struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -417,10 +432,12 @@ struct io_ring_ctx {
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head iopoll_list;
+ struct io_wq_work_list iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
+
+ struct list_head io_buffers_comp;
} ____cacheline_aligned_in_smp;
struct io_restriction restrictions;
@@ -436,6 +453,8 @@ struct io_ring_ctx {
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
+
+ struct list_head io_buffers_pages;
};
/* Keep this last, we don't need it for the fast path */
@@ -461,6 +480,11 @@ struct io_ring_ctx {
};
};
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
struct io_uring_task {
/* submission side */
int cached_refs;
@@ -469,12 +493,13 @@ struct io_uring_task {
const struct io_ring_ctx *last;
struct io_wq *io_wq;
struct percpu_counter inflight;
- atomic_t inflight_tracked;
atomic_t in_idle;
spinlock_t task_lock;
struct io_wq_work_list task_list;
+ struct io_wq_work_list prior_task_list;
struct callback_head task_work;
+ struct file **registered_rings;
bool task_running;
};
@@ -486,8 +511,6 @@ struct io_poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool done;
- bool canceled;
struct wait_queue_entry wait;
};
@@ -561,7 +584,8 @@ struct io_rw {
/* NOTE: kiocb has the file as the first member, so don't do it here */
struct kiocb kiocb;
u64 addr;
- u64 len;
+ u32 len;
+ u32 flags;
};
struct io_connect {
@@ -580,7 +604,7 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
- struct io_buffer *kbuf;
+ size_t done_io;
};
struct io_open {
@@ -623,10 +647,10 @@ struct io_epoll {
struct io_splice {
struct file *file_out;
- struct file *file_in;
loff_t off_out;
loff_t off_in;
u64 len;
+ int splice_fd_in;
unsigned int flags;
};
@@ -644,7 +668,7 @@ struct io_statx {
int dfd;
unsigned int mask;
unsigned int flags;
- const char __user *filename;
+ struct filename *filename;
struct statx __user *buffer;
};
@@ -692,9 +716,10 @@ struct io_hardlink {
int flags;
};
-struct io_completion {
+struct io_msg {
struct file *file;
- u32 cflags;
+ u64 user_data;
+ u32 len;
};
struct io_async_connect {
@@ -710,11 +735,15 @@ struct io_async_msghdr {
struct sockaddr_storage addr;
};
-struct io_async_rw {
- struct iovec fast_iov[UIO_FASTIOV];
- const struct iovec *free_iovec;
+struct io_rw_state {
struct iov_iter iter;
struct iov_iter_state iter_state;
+ struct iovec fast_iov[UIO_FASTIOV];
+};
+
+struct io_async_rw {
+ struct io_rw_state s;
+ const struct iovec *free_iovec;
size_t bytes_done;
struct wait_page_queue wpq;
};
@@ -726,6 +755,7 @@ enum {
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+ REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
/* first byte is taken by user flags, shift it to not overlap */
REQ_F_FAIL_BIT = 8,
@@ -741,9 +771,13 @@ enum {
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
+ REQ_F_ASYNC_DATA_BIT,
+ REQ_F_SKIP_LINK_CQES_BIT,
+ REQ_F_SINGLE_POLL_BIT,
+ REQ_F_DOUBLE_POLL_BIT,
+ REQ_F_PARTIAL_IO_BIT,
/* keep async read/write and isreg together and in order */
- REQ_F_NOWAIT_READ_BIT,
- REQ_F_NOWAIT_WRITE_BIT,
+ REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
@@ -763,6 +797,8 @@ enum {
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
+ /* IOSQE_CQE_SKIP_SUCCESS */
+ REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
/* fail rest of links */
REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
@@ -784,10 +820,8 @@ enum {
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
- /* supports async reads */
- REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
- /* supports async writes */
- REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
+ /* supports async reads/writes */
+ REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* has creds assigned */
@@ -796,6 +830,16 @@ enum {
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ /* ->async_data allocated */
+ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
+ /* don't post CQEs while failing linked requests */
+ REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+ /* single poll may be active */
+ REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
+ /* double poll may be active */
+ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
+ /* request has already done partial IO */
+ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
};
struct async_poll {
@@ -822,7 +866,7 @@ enum {
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
* access the file pointer through any of the sub-structs,
- * or directly as just 'ki_filp' in this struct.
+ * or directly as just 'file' in this struct.
*/
struct io_kiocb {
union {
@@ -852,39 +896,52 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
- /* use only after cleaning per-op data, see io_clean_op() */
- struct io_completion compl;
+ struct io_msg msg;
};
- /* opcode allocated if it needs to store data for async defer */
- void *async_data;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
-
u16 buf_index;
+ unsigned int flags;
+
+ u64 user_data;
u32 result;
+ /* fd initially, then cflags for completion */
+ union {
+ u32 cflags;
+ int fd;
+ };
struct io_ring_ctx *ctx;
- unsigned int flags;
- atomic_t refs;
struct task_struct *task;
- u64 user_data;
- struct io_kiocb *link;
struct percpu_ref *fixed_rsrc_refs;
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
- /* used with ctx->iopoll_list with reads/writes */
- struct list_head inflight_entry;
+ union {
+ /* used by request caches, completion batching and iopoll */
+ struct io_wq_work_node comp_list;
+ /* cache ->apoll->events */
+ int apoll_events;
+ };
+ atomic_t refs;
+ atomic_t poll_refs;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
+ /* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
- struct io_wq_work work;
+ /* opcode allocated if it needs to store data for async defer */
+ void *async_data;
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
+ /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
+ struct io_kiocb *link;
+ /* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
-
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+ struct io_wq_work work;
};
struct io_tctx_node {
@@ -902,21 +959,24 @@ struct io_defer_entry {
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
- /* opcode is not supported by this kernel */
- unsigned not_supported : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
+ unsigned poll_exclusive : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
/* do prep async if is going to be punted */
unsigned needs_async_setup : 1;
- /* should block plug */
- unsigned plug : 1;
+ /* opcode is not supported by this kernel */
+ unsigned not_supported : 1;
+ /* skip auditing */
+ unsigned audit_skip : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
@@ -930,6 +990,7 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_setup = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
@@ -939,16 +1000,19 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_setup = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
@@ -957,15 +1021,20 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
+ },
+ [IORING_OP_POLL_REMOVE] = {
+ .audit_skip = 1,
},
- [IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
@@ -983,18 +1052,24 @@ static const struct io_op_def io_op_defs[] = {
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_TIMEOUT] = {
+ .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
+ .audit_skip = 1,
},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .poll_exclusive = 1,
+ },
+ [IORING_OP_ASYNC_CANCEL] = {
+ .audit_skip = 1,
},
- [IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
+ .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_CONNECT] = {
@@ -1009,14 +1084,19 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT] = {},
[IORING_OP_CLOSE] = {},
- [IORING_OP_FILES_UPDATE] = {},
- [IORING_OP_STATX] = {},
+ [IORING_OP_FILES_UPDATE] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_STATX] = {
+ .audit_skip = 1,
+ },
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
@@ -1025,39 +1105,50 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_MADVISE] = {},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .audit_skip = 1,
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .audit_skip = 1,
},
[IORING_OP_OPENAT2] = {
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
+ },
+ [IORING_OP_PROVIDE_BUFFERS] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_REMOVE_BUFFERS] = {
+ .audit_skip = 1,
},
- [IORING_OP_PROVIDE_BUFFERS] = {},
- [IORING_OP_REMOVE_BUFFERS] = {},
[IORING_OP_TEE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
@@ -1067,6 +1158,9 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_MKDIRAT] = {},
[IORING_OP_SYMLINKAT] = {},
[IORING_OP_LINKAT] = {},
+ [IORING_OP_MSG_RING] = {
+ .needs_file = 1,
+ },
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -1079,8 +1173,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags);
+static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
+
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
@@ -1089,13 +1183,16 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
struct io_uring_rsrc_update2 *up,
unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
-static struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed);
+static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
+ unsigned issue_flags);
+static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+static void io_drop_inflight_file(struct io_kiocb *req);
+static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_ring_ctx *ctx);
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
@@ -1103,6 +1200,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
+static void io_eventfd_signal(struct io_ring_ctx *ctx);
static struct kmem_cache *req_cachep;
@@ -1154,12 +1252,6 @@ static inline bool req_ref_put_and_test(struct io_kiocb *req)
return atomic_dec_and_test(&req->refs);
}
-static inline void req_ref_put(struct io_kiocb *req)
-{
- WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
- WARN_ON_ONCE(req_ref_put_and_test(req));
-}
-
static inline void req_ref_get(struct io_kiocb *req)
{
WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
@@ -1167,6 +1259,12 @@ static inline void req_ref_get(struct io_kiocb *req)
atomic_inc(&req->refs);
}
+static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
+{
+ if (!wq_list_empty(&ctx->submit_state.compl_reqs))
+ __io_submit_flush_completions(ctx);
+}
+
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
if (!(req->flags & REQ_F_REFCOUNT)) {
@@ -1180,48 +1278,192 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
__io_req_set_refcount(req, 1);
}
-static inline void io_req_set_rsrc_node(struct io_kiocb *req)
+#define IO_RSRC_REF_BATCH 100
+
+static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct percpu_ref *ref = req->fixed_rsrc_refs;
+
+ if (ref) {
+ if (ref == &ctx->rsrc_node->refs)
+ ctx->rsrc_cached_refs++;
+ else
+ percpu_ref_put(ref);
+ }
+}
+
+static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ if (req->fixed_rsrc_refs)
+ percpu_ref_put(req->fixed_rsrc_refs);
+}
+
+static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ if (ctx->rsrc_cached_refs) {
+ percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+ ctx->rsrc_cached_refs = 0;
+ }
+}
+
+static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
+ percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+}
+static inline void io_req_set_rsrc_node(struct io_kiocb *req,
+ struct io_ring_ctx *ctx,
+ unsigned int issue_flags)
+{
if (!req->fixed_rsrc_refs) {
req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
- percpu_ref_get(req->fixed_rsrc_refs);
+
+ if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+ lockdep_assert_held(&ctx->uring_lock);
+ ctx->rsrc_cached_refs--;
+ if (unlikely(ctx->rsrc_cached_refs < 0))
+ io_rsrc_refs_refill(ctx);
+ } else {
+ percpu_ref_get(req->fixed_rsrc_refs);
+ }
+ }
+}
+
+static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
+{
+ struct io_buffer *kbuf = req->kbuf;
+ unsigned int cflags;
+
+ cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ list_add(&kbuf->list, list);
+ req->kbuf = NULL;
+ return cflags;
+}
+
+static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
+{
+ lockdep_assert_held(&req->ctx->completion_lock);
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
+ return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
+}
+
+static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+ unsigned issue_flags)
+{
+ unsigned int cflags;
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
+
+ /*
+ * We can add this buffer back to two lists:
+ *
+ * 1) The io_buffers_cache list. This one is protected by the
+ * ctx->uring_lock. If we already hold this lock, add back to this
+ * list as we can grab it from issue as well.
+ * 2) The io_buffers_comp list. This one is protected by the
+ * ctx->completion_lock.
+ *
+ * We migrate buffers from the comp_list to the issue cache list
+ * when we need one.
+ */
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ lockdep_assert_held(&req->ctx->uring_lock);
+
+ cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
}
+
+ return cflags;
}
-static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
+ unsigned int bgid)
{
- bool got = percpu_ref_tryget(ref);
+ struct list_head *hash_list;
+ struct io_buffer_list *bl;
+
+ hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ list_for_each_entry(bl, hash_list, list)
+ if (bl->bgid == bgid || bgid == -1U)
+ return bl;
- /* already at zero, wait for ->release() */
- if (!got)
- wait_for_completion(compl);
- percpu_ref_resurrect(ref);
- if (got)
- percpu_ref_put(ref);
+ return NULL;
+}
+
+static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ struct io_buffer *buf;
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return;
+ /* don't recycle if we already did IO to this buffer */
+ if (req->flags & REQ_F_PARTIAL_IO)
+ return;
+
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_lock(&ctx->uring_lock);
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ buf = req->kbuf;
+ bl = io_buffer_get_list(ctx, buf->bgid);
+ list_add(&buf->list, &bl->buf_list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ req->kbuf = NULL;
+
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
bool cancel_all)
+ __must_hold(&req->ctx->timeout_lock)
{
- struct io_kiocb *req;
+ if (task && head->task != task)
+ return false;
+ return cancel_all;
+}
+/*
+ * As io_match_task() but protected against racing with linked timeouts.
+ * User must not hold timeout_lock.
+ */
+static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
+ bool cancel_all)
+{
if (task && head->task != task)
return false;
- if (cancel_all)
- return true;
+ return cancel_all;
+}
- io_for_each_link(req, head) {
- if (req->flags & REQ_F_INFLIGHT)
- return true;
- }
- return false;
+static inline bool req_has_async_data(struct io_kiocb *req)
+{
+ return req->flags & REQ_F_ASYNC_DATA;
}
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
+ if (req->flags & REQ_F_CQE_SKIP) {
+ req->flags &= ~REQ_F_CQE_SKIP;
+ req->flags |= REQ_F_SKIP_LINK_CQES;
+ }
}
static inline void req_fail_link_node(struct io_kiocb *req, int res)
@@ -1230,7 +1472,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
req->result = res;
}
-static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1242,7 +1484,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
return !req->timeout.off;
}
-static void io_fallback_req_func(struct work_struct *work)
+static __cold void io_fallback_req_func(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
fallback_work.work);
@@ -1255,18 +1497,16 @@ static void io_fallback_req_func(struct work_struct *work)
req->io_task_work.func(req, &locked);
if (locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
percpu_ref_put(&ctx->refs);
-
}
-static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int hash_bits;
+ int i, hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -1293,6 +1533,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
+ ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
+ sizeof(struct list_head), GFP_KERNEL);
+ if (!ctx->io_buffers)
+ goto err;
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
+ INIT_LIST_HEAD(&ctx->io_buffers[i]);
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1300,16 +1547,18 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
- init_waitqueue_head(&ctx->poll_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ INIT_LIST_HEAD(&ctx->apoll_cache);
init_completion(&ctx->ref_comp);
- xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
- INIT_LIST_HEAD(&ctx->iopoll_list);
+ INIT_WQ_LIST(&ctx->iopoll_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_pages);
+ INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1318,13 +1567,15 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
- INIT_LIST_HEAD(&ctx->submit_state.free_list);
- INIT_LIST_HEAD(&ctx->locked_free_list);
+ ctx->submit_state.free_list.next = NULL;
+ INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
+ INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx;
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
+ kfree(ctx->io_buffers);
kfree(ctx);
return NULL;
}
@@ -1348,26 +1599,13 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-#define FFS_ASYNC_READ 0x1UL
-#define FFS_ASYNC_WRITE 0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG 0x4UL
-#else
-#define FFS_ISREG 0x0UL
-#endif
-#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
- return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
-}
-
-static void io_req_track_inflight(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_INFLIGHT)) {
- req->flags |= REQ_F_INFLIGHT;
- atomic_inc(&current->io_uring->inflight_tracked);
- }
+ return req->flags & REQ_F_FIXED_FILE;
}
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
@@ -1413,14 +1651,6 @@ static void io_prep_async_work(struct io_kiocb *req)
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
-
- switch (req->opcode) {
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
- req->work.flags |= IO_WQ_WORK_UNBOUND;
- break;
- }
}
static void io_prep_async_link(struct io_kiocb *req)
@@ -1430,25 +1660,32 @@ static void io_prep_async_link(struct io_kiocb *req)
if (req->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = req->ctx;
- spin_lock(&ctx->completion_lock);
+ spin_lock_irq(&ctx->timeout_lock);
io_for_each_link(cur, req)
io_prep_async_work(cur);
- spin_unlock(&ctx->completion_lock);
+ spin_unlock_irq(&ctx->timeout_lock);
} else {
io_for_each_link(cur, req)
io_prep_async_work(cur);
}
}
-static void io_queue_async_work(struct io_kiocb *req, bool *locked)
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_submit_state *state = &ctx->submit_state;
+
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ ctx->submit_state.flush_cqes = true;
+ wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
+static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
- /* must not take the lock, NULL it as a precaution */
- locked = NULL;
-
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
@@ -1465,8 +1702,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *locked)
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
- trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
- &req->work, req->flags);
+ trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
+ &req->work, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1484,12 +1721,12 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- io_cqring_fill_event(req->ctx, req->user_data, status, 0);
+ io_fill_cqe_req(req, status, 0);
io_put_req_deferred(req);
}
}
-static void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
@@ -1503,16 +1740,15 @@ static void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-static void io_flush_timeouts(struct io_ring_ctx *ctx)
+static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
__must_hold(&ctx->completion_lock)
{
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
+ struct io_kiocb *req, *tmp;
spin_lock_irq(&ctx->timeout_lock);
- while (!list_empty(&ctx->timeout_list)) {
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
u32 events_needed, events_got;
- struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
- struct io_kiocb, timeout.list);
if (io_is_timeout_noseq(req))
break;
@@ -1529,29 +1765,33 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
if (events_got < events_needed)
break;
- list_del_init(&req->timeout.list);
io_kill_timeout(req, 0);
}
ctx->cq_last_tm_flush = seq;
spin_unlock_irq(&ctx->timeout_lock);
}
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
- if (ctx->off_timeout_used)
- io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
-}
-
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active))
- __io_commit_cqring_flush(ctx);
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+{
+ if (ctx->off_timeout_used || ctx->drain_active) {
+ spin_lock(&ctx->completion_lock);
+ if (ctx->off_timeout_used)
+ io_flush_timeouts(ctx);
+ if (ctx->drain_active)
+ io_queue_deferred(ctx);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ }
+ if (ctx->has_evfd)
+ io_eventfd_signal(ctx);
+}
+
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
@@ -1581,23 +1821,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
return &rings->cqes[tail & mask];
}
-static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
- if (likely(!ctx->cq_ev_fd))
- return false;
+ struct io_ev_fd *ev_fd;
+
+ rcu_read_lock();
+ /*
+ * rcu_dereference ctx->io_ev_fd once and use it for both for checking
+ * and eventfd_signal
+ */
+ ev_fd = rcu_dereference(ctx->io_ev_fd);
+
+ /*
+ * Check again if ev_fd exists incase an io_eventfd_unregister call
+ * completed between the NULL check of ctx->io_ev_fd at the start of
+ * the function and rcu_read_lock.
+ */
+ if (unlikely(!ev_fd))
+ goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return false;
- return !ctx->eventfd_async || io_wq_current_is_worker();
+ goto out;
+
+ if (!ev_fd->eventfd_async || io_wq_current_is_worker())
+ eventfd_signal(ev_fd->cq_ev_fd, 1);
+out:
+ rcu_read_unlock();
}
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
* wake_up_all() may seem excessive, but io_wake_function() and
@@ -1606,27 +1857,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
+}
+
+/*
+ * This should only get called when at least one event has been posted.
+ * Some applications rely on the eventfd notification count only changing
+ * IFF a new CQE has been added to the CQ ring. There's no depedency on
+ * 1:1 relationship between how many times this function is called (and
+ * hence the eventfd count) and number of CQEs posted to the CQ ring.
+ */
+static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
+
+ io_cqring_wake(ctx);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
- /* see waitqueue_active() comment */
- smp_mb();
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
- }
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ io_cqring_wake(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
@@ -1721,8 +1977,20 @@ static inline void io_get_task_refs(int nr)
io_task_refs_refill(tctx);
}
+static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
+{
+ struct io_uring_task *tctx = task->io_uring;
+ unsigned int refs = tctx->cached_refs;
+
+ if (refs) {
+ tctx->cached_refs = 0;
+ percpu_counter_sub(&tctx->inflight, refs);
+ put_task_struct_many(task, refs);
+ }
+}
+
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_overflow_cqe *ocqe;
@@ -1749,13 +2017,11 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
return true;
}
-static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
+ s32 res, u32 cflags)
{
struct io_uring_cqe *cqe;
- trace_io_uring_complete(ctx, user_data, res, cflags);
-
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
@@ -1771,20 +2037,33 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
return io_cqring_event_overflow(ctx, user_data, res, cflags);
}
-/* not as hot to bloat with inlining */
-static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
- return __io_cqring_fill_event(ctx, user_data, res, cflags);
+ trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
+ return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
}
-static void io_req_complete_post(struct io_kiocb *req, long res,
- unsigned int cflags)
+static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+{
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(req, res, cflags);
+}
+
+static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
+ s32 res, u32 cflags)
+{
+ ctx->cq_extra++;
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
+ return __io_fill_cqe(ctx, user_data, res, cflags);
+}
+
+static void __io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
- spin_lock(&ctx->completion_lock);
- __io_cqring_fill_event(ctx, req->user_data, res, cflags);
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(req, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
@@ -1798,40 +2077,42 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
req->link = NULL;
}
}
+ io_req_put_rsrc(req, ctx);
+ /*
+ * Selected buffer deallocation in io_clean_op() assumes that
+ * we don't hold ->completion_lock. Clean them here to avoid
+ * deadlocks.
+ */
+ io_put_kbuf_comp(req);
io_dismantle_req(req);
io_put_task(req->task, 1);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
- } else {
- if (!percpu_ref_tryget(&ctx->refs))
- req = NULL;
- }
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
-
- if (req) {
- io_cqring_ev_posted(ctx);
- percpu_ref_put(&ctx->refs);
}
}
-static inline bool io_req_needs_clean(struct io_kiocb *req)
+static void io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
- return req->flags & IO_REQ_CLEAN_FLAGS;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ __io_req_complete_post(req, res, cflags);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
}
-static void io_req_complete_state(struct io_kiocb *req, long res,
- unsigned int cflags)
+static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
- if (io_req_needs_clean(req))
- io_clean_op(req);
req->result = res;
- req->compl.cflags = cflags;
+ req->cflags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
}
static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
- long res, unsigned cflags)
+ s32 res, u32 cflags)
{
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_state(req, res, cflags);
@@ -1839,15 +2120,15 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
-static inline void io_req_complete(struct io_kiocb *req, long res)
+static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
__io_req_complete(req, 0, res, 0);
}
-static void io_req_complete_failed(struct io_kiocb *req, long res)
+static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
- io_req_complete_post(req, res, 0);
+ io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
static void io_req_complete_fail_submit(struct io_kiocb *req)
@@ -1878,7 +2159,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
spin_lock(&ctx->completion_lock);
- list_splice_init(&ctx->locked_free_list, &state->free_list);
+ wq_list_splice(&ctx->locked_free_list, &state->free_list);
ctx->locked_free_nr = 0;
spin_unlock(&ctx->completion_lock);
}
@@ -1887,7 +2168,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- int nr;
/*
* If we have more than a batch's worth of requests in our IRQ side
@@ -1896,20 +2176,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
*/
if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
io_flush_cached_locked_reqs(ctx, state);
-
- nr = state->free_reqs;
- while (!list_empty(&state->free_list)) {
- struct io_kiocb *req = list_first_entry(&state->free_list,
- struct io_kiocb, inflight_entry);
-
- list_del(&req->inflight_entry);
- state->reqs[nr++] = req;
- if (nr == ARRAY_SIZE(state->reqs))
- break;
- }
-
- state->free_reqs = nr;
- return nr != 0;
+ return !!state->free_list.next;
}
/*
@@ -1918,38 +2185,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
* Because of that, io_alloc_req() should be called only under ->uring_lock
* and with extra caution to not get a request that is still worked on.
*/
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ void *reqs[IO_REQ_ALLOC_BATCH];
+ struct io_kiocb *req;
int ret, i;
- BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
-
- if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
- goto got_req;
+ if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
+ return true;
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
- state->reqs);
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
/*
* Bulk alloc is all-or-nothing. If we fail to get a batch,
* retry single alloc to be on the safe side.
*/
if (unlikely(ret <= 0)) {
- state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!state->reqs[0])
- return NULL;
+ reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+ if (!reqs[0])
+ return false;
ret = 1;
}
- for (i = 0; i < ret; i++)
- io_preinit_req(state->reqs[i], ctx);
- state->free_reqs = ret;
-got_req:
- state->free_reqs--;
- return state->reqs[state->free_reqs];
+ percpu_ref_get_many(&ctx->refs, ret);
+ for (i = 0; i < ret; i++) {
+ req = reqs[i];
+
+ io_preinit_req(req, ctx);
+ wq_stack_add_head(&req->comp_list, &state->free_list);
+ }
+ return true;
+}
+
+static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
+{
+ if (unlikely(!ctx->submit_state.free_list.next))
+ return __io_alloc_req_refill(ctx);
+ return true;
+}
+
+static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+{
+ struct io_wq_work_node *node;
+
+ node = wq_stack_extract(&ctx->submit_state.free_list);
+ return container_of(node, struct io_kiocb, comp_list);
}
static inline void io_put_file(struct file *file)
@@ -1958,35 +2241,28 @@ static inline void io_put_file(struct file *file)
fput(file);
}
-static void io_dismantle_req(struct io_kiocb *req)
+static inline void io_dismantle_req(struct io_kiocb *req)
{
unsigned int flags = req->flags;
- if (io_req_needs_clean(req))
+ if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
if (!(flags & REQ_F_FIXED_FILE))
io_put_file(req->file);
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
- if (req->async_data) {
- kfree(req->async_data);
- req->async_data = NULL;
- }
}
-static void __io_free_req(struct io_kiocb *req)
+static __cold void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
spin_lock(&ctx->completion_lock);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
spin_unlock(&ctx->completion_lock);
-
- percpu_ref_put(&ctx->refs);
}
static inline void io_remove_next_linked(struct io_kiocb *req)
@@ -2010,8 +2286,8 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
link->timeout.head = NULL;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
list_del(&link->timeout.list);
- io_cqring_fill_event(link->ctx, link->user_data,
- -ECANCELED, 0);
+ /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
+ io_fill_cqe_req(link, -ECANCELED, 0);
io_put_req_deferred(link);
return true;
}
@@ -2023,6 +2299,7 @@ static void io_fail_links(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
{
struct io_kiocb *nxt, *link = req->link;
+ bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
req->link = NULL;
while (link) {
@@ -2034,8 +2311,13 @@ static void io_fail_links(struct io_kiocb *req)
nxt = link->link;
link->link = NULL;
- trace_io_uring_fail_link(req, link);
- io_cqring_fill_event(link->ctx, link->user_data, res, 0);
+ trace_io_uring_fail_link(req->ctx, req, req->user_data,
+ req->opcode, link);
+
+ if (!ignore_cqes) {
+ link->flags &= ~REQ_F_CQE_SKIP;
+ io_fill_cqe_req(link, res, 0);
+ }
io_put_req_deferred(link);
link = nxt;
}
@@ -2052,8 +2334,8 @@ static bool io_disarm_next(struct io_kiocb *req)
req->flags &= ~REQ_F_ARM_LTIMEOUT;
if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
io_remove_next_linked(req);
- io_cqring_fill_event(link->ctx, link->user_data,
- -ECANCELED, 0);
+ /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
+ io_fill_cqe_req(link, -ECANCELED, 0);
io_put_req_deferred(link);
posted = true;
}
@@ -2072,98 +2354,158 @@ static bool io_disarm_next(struct io_kiocb *req)
return posted;
}
-static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+static void __io_req_find_next_prep(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool posted;
+
+ spin_lock(&ctx->completion_lock);
+ posted = io_disarm_next(req);
+ if (posted)
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (posted)
+ io_cqring_ev_posted(ctx);
+}
+
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt;
+ if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
+ return NULL;
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & IO_DISARM_MASK) {
- struct io_ring_ctx *ctx = req->ctx;
- bool posted;
-
- spin_lock(&ctx->completion_lock);
- posted = io_disarm_next(req);
- if (posted)
- io_commit_cqring(req->ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
- }
+ if (unlikely(req->flags & IO_DISARM_MASK))
+ __io_req_find_next_prep(req);
nxt = req->link;
req->link = NULL;
return nxt;
}
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
- if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
- return NULL;
- return __io_req_find_next(req);
-}
-
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
if (*locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
*locked = false;
}
percpu_ref_put(&ctx->refs);
}
+static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
+{
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+}
+
+static void handle_prev_tw_list(struct io_wq_work_node *node,
+ struct io_ring_ctx **ctx, bool *uring_locked)
+{
+ if (*ctx && !*uring_locked)
+ spin_lock(&(*ctx)->completion_lock);
+
+ do {
+ struct io_wq_work_node *next = node->next;
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ io_task_work.node);
+
+ prefetch(container_of(next, struct io_kiocb, io_task_work.node));
+
+ if (req->ctx != *ctx) {
+ if (unlikely(!*uring_locked && *ctx))
+ ctx_commit_and_unlock(*ctx);
+
+ ctx_flush_and_put(*ctx, uring_locked);
+ *ctx = req->ctx;
+ /* if not contended, grab and improve batching */
+ *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
+ percpu_ref_get(&(*ctx)->refs);
+ if (unlikely(!*uring_locked))
+ spin_lock(&(*ctx)->completion_lock);
+ }
+ if (likely(*uring_locked))
+ req->io_task_work.func(req, uring_locked);
+ else
+ __io_req_complete_post(req, req->result,
+ io_put_kbuf_comp(req));
+ node = next;
+ } while (node);
+
+ if (unlikely(!*uring_locked))
+ ctx_commit_and_unlock(*ctx);
+}
+
+static void handle_tw_list(struct io_wq_work_node *node,
+ struct io_ring_ctx **ctx, bool *locked)
+{
+ do {
+ struct io_wq_work_node *next = node->next;
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ io_task_work.node);
+
+ prefetch(container_of(next, struct io_kiocb, io_task_work.node));
+
+ if (req->ctx != *ctx) {
+ ctx_flush_and_put(*ctx, locked);
+ *ctx = req->ctx;
+ /* if not contended, grab and improve batching */
+ *locked = mutex_trylock(&(*ctx)->uring_lock);
+ percpu_ref_get(&(*ctx)->refs);
+ }
+ req->io_task_work.func(req, locked);
+ node = next;
+ } while (node);
+}
+
static void tctx_task_work(struct callback_head *cb)
{
- bool locked = false;
+ bool uring_locked = false;
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
while (1) {
- struct io_wq_work_node *node;
+ struct io_wq_work_node *node1, *node2;
- if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+ if (!tctx->task_list.first &&
+ !tctx->prior_task_list.first && uring_locked)
io_submit_flush_completions(ctx);
spin_lock_irq(&tctx->task_lock);
- node = tctx->task_list.first;
+ node1 = tctx->prior_task_list.first;
+ node2 = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
- if (!node)
+ INIT_WQ_LIST(&tctx->prior_task_list);
+ if (!node2 && !node1)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
- if (!node)
+ if (!node2 && !node1)
break;
- do {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- if (req->ctx != ctx) {
- ctx_flush_and_put(ctx, &locked);
- ctx = req->ctx;
- /* if not contended, grab and improve batching */
- locked = mutex_trylock(&ctx->uring_lock);
- percpu_ref_get(&ctx->refs);
- }
- req->io_task_work.func(req, &locked);
- node = next;
- } while (node);
+ if (node1)
+ handle_prev_tw_list(node1, &ctx, &uring_locked);
+ if (node2)
+ handle_tw_list(node2, &ctx, &uring_locked);
cond_resched();
}
- ctx_flush_and_put(ctx, &locked);
+ ctx_flush_and_put(ctx, &uring_locked);
+
+ /* relaxed read is enough as only the task itself sets ->in_idle */
+ if (unlikely(atomic_read(&tctx->in_idle)))
+ io_uring_drop_tctx_refs(current);
}
-static void io_req_task_work_add(struct io_kiocb *req)
+static void io_req_task_work_add(struct io_kiocb *req, bool priority)
{
struct task_struct *tsk = req->task;
struct io_uring_task *tctx = tsk->io_uring;
@@ -2174,8 +2516,13 @@ static void io_req_task_work_add(struct io_kiocb *req)
WARN_ON_ONCE(!tctx);
+ io_drop_inflight_file(req);
+
spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+ if (priority)
+ wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
+ else
+ wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@@ -2192,15 +2539,15 @@ static void io_req_task_work_add(struct io_kiocb *req)
* will do the job.
*/
notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (!task_work_add(tsk, &tctx->task_work, notify)) {
- wake_up_process(tsk);
+ if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
+ if (notify == TWA_NONE)
+ wake_up_process(tsk);
return;
}
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
+ node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@@ -2237,19 +2584,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
req->result = ret;
req->io_task_work.func = io_req_task_cancel;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static void io_req_task_queue(struct io_kiocb *req)
{
req->io_task_work.func = io_req_task_submit;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
req->io_task_work.func = io_queue_async_work;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static inline void io_queue_next(struct io_kiocb *req)
@@ -2271,77 +2618,75 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked)
io_free_req(req);
}
-struct req_batch {
- struct task_struct *task;
- int task_refs;
- int ctx_refs;
-};
-
-static inline void io_init_req_batch(struct req_batch *rb)
+static void io_free_batch_list(struct io_ring_ctx *ctx,
+ struct io_wq_work_node *node)
+ __must_hold(&ctx->uring_lock)
{
- rb->task_refs = 0;
- rb->ctx_refs = 0;
- rb->task = NULL;
-}
+ struct task_struct *task = NULL;
+ int task_refs = 0;
-static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
- struct req_batch *rb)
-{
- if (rb->ctx_refs)
- percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
-}
+ do {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
- struct io_submit_state *state)
-{
- io_queue_next(req);
- io_dismantle_req(req);
+ if (unlikely(req->flags & REQ_F_REFCOUNT)) {
+ node = req->comp_list.next;
+ if (!req_ref_put_and_test(req))
+ continue;
+ }
- if (req->task != rb->task) {
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
- rb->task = req->task;
- rb->task_refs = 0;
- }
- rb->task_refs++;
- rb->ctx_refs++;
+ io_req_put_rsrc_locked(req, ctx);
+ io_queue_next(req);
+ io_dismantle_req(req);
- if (state->free_reqs != ARRAY_SIZE(state->reqs))
- state->reqs[state->free_reqs++] = req;
- else
- list_add(&req->inflight_entry, &state->free_list);
+ if (req->task != task) {
+ if (task)
+ io_put_task(task, task_refs);
+ task = req->task;
+ task_refs = 0;
+ }
+ task_refs++;
+ node = req->comp_list.next;
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ } while (node);
+
+ if (task)
+ io_put_task(task, task_refs);
}
-static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
+ struct io_wq_work_node *node, *prev;
struct io_submit_state *state = &ctx->submit_state;
- int i, nr = state->compl_nr;
- struct req_batch rb;
-
- spin_lock(&ctx->completion_lock);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
-
- __io_cqring_fill_event(ctx, req->user_data, req->result,
- req->compl.cflags);
- }
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
- io_init_req_batch(&rb);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
+ if (state->flush_cqes) {
+ spin_lock(&ctx->completion_lock);
+ wq_list_for_each(node, prev, &state->compl_reqs) {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
+
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(req, req->result, req->cflags);
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ struct async_poll *apoll = req->apoll;
+
+ if (apoll->double_poll)
+ kfree(apoll->double_poll);
+ list_add(&apoll->poll.wait.entry,
+ &ctx->apoll_cache);
+ req->flags &= ~REQ_F_POLLED;
+ }
+ }
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+ state->flush_cqes = false;
}
- io_req_free_batch_finish(ctx, &rb);
- state->compl_nr = 0;
+ io_free_batch_list(ctx, state->compl_reqs.first);
+ INIT_WQ_LIST(&state->compl_reqs);
}
/*
@@ -2369,7 +2714,7 @@ static inline void io_put_req_deferred(struct io_kiocb *req)
{
if (req_ref_put_and_test(req)) {
req->io_task_work.func = io_free_req_work;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
}
@@ -2388,82 +2733,35 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
-static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
-{
- unsigned int cflags;
-
- cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- cflags |= IORING_CQE_F_BUFFER;
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- kfree(kbuf);
- return cflags;
-}
-
-static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
-{
- struct io_buffer *kbuf;
-
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
- return 0;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- return io_put_kbuf(req, kbuf);
-}
-
static inline bool io_run_task_work(void)
{
- if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
__set_current_state(TASK_RUNNING);
- tracehook_notify_signal();
+ clear_notify_signal();
+ if (task_work_pending(current))
+ task_work_run();
return true;
}
return false;
}
-/*
- * Find and free completed poll iocbs
- */
-static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done)
+static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
- struct req_batch rb;
- struct io_kiocb *req;
-
- /* order with ->result store in io_complete_rw_iopoll() */
- smp_rmb();
-
- io_init_req_batch(&rb);
- while (!list_empty(done)) {
- req = list_first_entry(done, struct io_kiocb, inflight_entry);
- list_del(&req->inflight_entry);
-
- __io_cqring_fill_event(ctx, req->user_data, req->result,
- io_put_rw_kbuf(req));
- (*nr_events)++;
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_commit_cqring(ctx);
- io_cqring_ev_posted_iopoll(ctx);
- io_req_free_batch_finish(ctx, &rb);
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
-{
- struct io_kiocb *req, *tmp;
- LIST_HEAD(done);
- bool spin;
+ struct io_wq_work_node *pos, *start, *prev;
+ unsigned int poll_flags = BLK_POLL_NOSLEEP;
+ DEFINE_IO_COMP_BATCH(iob);
+ int nr_events = 0;
/*
* Only spin for completions if we don't have multiple devices hanging
- * off our complete list, and we're under the requested amount.
+ * off our complete list.
*/
- spin = !ctx->poll_multi_queue && *nr_events < min;
+ if (ctx->poll_multi_queue || force_nonspin)
+ poll_flags |= BLK_POLL_ONESHOT;
- list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
+ wq_list_for_each(pos, start, &ctx->iopoll_list) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
struct kiocb *kiocb = &req->rw.kiocb;
int ret;
@@ -2472,47 +2770,63 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
- if (READ_ONCE(req->iopoll_completed)) {
- list_move_tail(&req->inflight_entry, &done);
- continue;
- }
- if (!list_empty(&done))
+ if (READ_ONCE(req->iopoll_completed))
break;
- ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+ ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
if (unlikely(ret < 0))
return ret;
else if (ret)
- spin = false;
+ poll_flags |= BLK_POLL_ONESHOT;
/* iopoll may have completed current req */
- if (READ_ONCE(req->iopoll_completed))
- list_move_tail(&req->inflight_entry, &done);
+ if (!rq_list_empty(iob.req_list) ||
+ READ_ONCE(req->iopoll_completed))
+ break;
}
- if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done);
+ if (!rq_list_empty(iob.req_list))
+ iob.complete(&iob);
+ else if (!pos)
+ return 0;
- return 0;
+ prev = start;
+ wq_list_for_each_resume(pos, prev) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
+
+ /* order with io_complete_rw_iopoll(), e.g. ->result updates */
+ if (!smp_load_acquire(&req->iopoll_completed))
+ break;
+ nr_events++;
+ if (unlikely(req->flags & REQ_F_CQE_SKIP))
+ continue;
+ __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
+ }
+
+ if (unlikely(!nr_events))
+ return 0;
+
+ io_commit_cqring(ctx);
+ io_cqring_ev_posted_iopoll(ctx);
+ pos = start ? start->next : ctx->iopoll_list.first;
+ wq_list_cut(&ctx->iopoll_list, prev, start);
+ io_free_batch_list(ctx, pos);
+ return nr_events;
}
/*
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
-static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
+static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_IOPOLL))
return;
mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->iopoll_list)) {
- unsigned int nr_events = 0;
-
- io_do_iopoll(ctx, &nr_events, 0);
-
+ while (!wq_list_empty(&ctx->iopoll_list)) {
/* let it sleep and repeat later if can't complete a request */
- if (nr_events == 0)
+ if (io_do_iopoll(ctx, true) == 0)
break;
/*
* Ensure we allow local-to-the-cpu processing to take place,
@@ -2559,7 +2873,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* forever, while the workqueue is stuck trying to acquire the
* very same mutex.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
u32 tail = ctx->cached_cq_tail;
mutex_unlock(&ctx->uring_lock);
@@ -2568,11 +2882,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
/* some requests don't go through iopoll_list */
if (tail != ctx->cached_cq_tail ||
- list_empty(&ctx->iopoll_list))
+ wq_list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min);
- } while (!ret && nr_events < min && !need_resched());
+ ret = io_do_iopoll(ctx, !min);
+ if (ret < 0)
+ break;
+ nr_events += ret;
+ ret = 0;
+ } while (nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
return ret;
@@ -2597,9 +2915,9 @@ static bool io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *rw = req->async_data;
- if (!rw)
+ if (!req_has_async_data(req))
return !io_req_prep_async(req);
- iov_iter_restore(&rw->iter, &rw->iter_state);
+ iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
return true;
}
@@ -2641,9 +2959,13 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
- if (req->rw.kiocb.ki_flags & IOCB_WRITE)
+ if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
kiocb_end_write(req);
- if (res != req->result) {
+ fsnotify_modify(req->file);
+ } else {
+ fsnotify_access(req->file);
+ }
+ if (unlikely(res != req->result)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
@@ -2655,33 +2977,29 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
return false;
}
-static void io_req_task_complete(struct io_kiocb *req, bool *locked)
+static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
- unsigned int cflags = io_put_rw_kbuf(req);
- long res = req->result;
+ int res = req->result;
if (*locked) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
- io_req_complete_state(req, res, cflags);
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
+ io_req_complete_state(req, res, io_put_kbuf(req, 0));
+ io_req_add_compl_list(req);
} else {
- io_req_complete_post(req, res, cflags);
+ io_req_complete_post(req, res,
+ io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
}
-static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+static void __io_complete_rw(struct io_kiocb *req, long res,
unsigned int issue_flags)
{
if (__io_complete_rw_common(req, res))
return;
- __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
+ __io_req_complete(req, issue_flags, req->result,
+ io_put_kbuf(req, issue_flags));
}
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -2689,10 +3007,10 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
return;
req->result = res;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
}
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -2703,12 +3021,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
req->flags |= REQ_F_REISSUE;
return;
}
+ req->result = res;
}
- WRITE_ONCE(req->result, res);
- /* order with io_iopoll_complete() checking ->result */
- smp_wmb();
- WRITE_ONCE(req->iopoll_completed, 1);
+ /* order with io_iopoll_complete() checking ->iopoll_completed */
+ smp_store_release(&req->iopoll_completed, 1);
}
/*
@@ -2717,13 +3034,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- const bool in_async = io_wq_current_is_worker();
+ const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (unlikely(in_async))
+ if (unlikely(needs_lock))
mutex_lock(&ctx->uring_lock);
/*
@@ -2731,23 +3048,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* how we do polling eventually, not spinning if we're on potentially
* different devices.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
ctx->poll_multi_queue = false;
} else if (!ctx->poll_multi_queue) {
struct io_kiocb *list_req;
- unsigned int queue_num0, queue_num1;
- list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
- inflight_entry);
-
- if (list_req->file != req->file) {
+ list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
+ comp_list);
+ if (list_req->file != req->file)
ctx->poll_multi_queue = true;
- } else {
- queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
- queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
- if (queue_num0 != queue_num1)
- ctx->poll_multi_queue = true;
- }
}
/*
@@ -2755,11 +3064,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
- list_add(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
else
- list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
- if (unlikely(in_async)) {
+ if (unlikely(needs_lock)) {
/*
* If IORING_SETUP_SQPOLL is enabled, sqes are either handle
* in sq thread task context or in io worker task context. If
@@ -2784,10 +3093,8 @@ static bool io_bdev_nowait(struct block_device *bdev)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool __io_file_supports_nowait(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
- umode_t mode = file_inode(file)->i_mode;
-
if (S_ISBLK(mode)) {
if (IS_ENABLED(CONFIG_BLOCK) &&
io_bdev_nowait(I_BDEV(file->f_mapping->host)))
@@ -2807,57 +3114,38 @@ static bool __io_file_supports_nowait(struct file *file, int rw)
/* any ->read/write should understand O_NONBLOCK */
if (file->f_flags & O_NONBLOCK)
return true;
+ return file->f_mode & FMODE_NOWAIT;
+}
- if (!(file->f_mode & FMODE_NOWAIT))
- return false;
-
- if (rw == READ)
- return file->f_op->read_iter != NULL;
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static unsigned int io_file_get_flags(struct file *file)
+{
+ umode_t mode = file_inode(file)->i_mode;
+ unsigned int res = 0;
- return file->f_op->write_iter != NULL;
+ if (S_ISREG(mode))
+ res |= FFS_ISREG;
+ if (__io_file_supports_nowait(file, mode))
+ res |= FFS_NOWAIT;
+ return res;
}
-static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
- if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
- return true;
- else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
- return true;
-
- return __io_file_supports_nowait(req->file, rw);
+ return req->flags & REQ_F_SUPPORT_NOWAIT;
}
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- int rw)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
- struct file *file = req->file;
unsigned ioprio;
int ret;
- if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
- req->flags |= REQ_F_ISREG;
-
kiocb->ki_pos = READ_ONCE(sqe->off);
- if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
- req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = file->f_pos;
- }
- kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
- kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
- ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
- if (unlikely(ret))
- return ret;
-
- /*
- * If the file is marked O_NONBLOCK, still allow retry for it if it
- * supports async. Otherwise it's impossible to use O_NONBLOCK files
- * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
- */
- if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
- req->flags |= REQ_F_NOWAIT;
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -2866,31 +3154,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
kiocb->ki_ioprio = ioprio;
- } else
- kiocb->ki_ioprio = get_current_ioprio();
-
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!(kiocb->ki_flags & IOCB_DIRECT) ||
- !kiocb->ki_filp->f_op->iopoll)
- return -EOPNOTSUPP;
-
- kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
- kiocb->ki_complete = io_complete_rw_iopoll;
- req->iopoll_completed = 0;
} else {
- if (kiocb->ki_flags & IOCB_HIPRI)
- return -EINVAL;
- kiocb->ki_complete = io_complete_rw;
- }
-
- if (req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED) {
- req->imu = NULL;
- io_req_set_rsrc_node(req);
+ kiocb->ki_ioprio = get_current_ioprio();
}
+ req->imu = NULL;
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
+ req->rw.flags = READ_ONCE(sqe->rw_flags);
req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -2912,18 +3183,34 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
ret = -EINTR;
fallthrough;
default:
- kiocb->ki_complete(kiocb, ret, 0);
+ kiocb->ki_complete(kiocb, ret);
+ }
+}
+
+static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+
+ if (kiocb->ki_pos != -1)
+ return &kiocb->ki_pos;
+
+ if (!(req->file->f_mode & FMODE_STREAM)) {
+ req->flags |= REQ_F_CUR_POS;
+ kiocb->ki_pos = req->file->f_pos;
+ return &kiocb->ki_pos;
}
+
+ kiocb->ki_pos = 0;
+ return NULL;
}
-static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
+static void kiocb_done(struct io_kiocb *req, ssize_t ret,
unsigned int issue_flags)
{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
struct io_async_rw *io = req->async_data;
/* add previously done IO, if any */
- if (io && io->bytes_done > 0) {
+ if (req_has_async_data(req) && io->bytes_done > 0) {
if (ret < 0)
ret = io->bytes_done;
else
@@ -2931,29 +3218,18 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
}
if (req->flags & REQ_F_CUR_POS)
- req->file->f_pos = kiocb->ki_pos;
- if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
- __io_complete_rw(req, ret, 0, issue_flags);
+ req->file->f_pos = req->rw.kiocb.ki_pos;
+ if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
+ __io_complete_rw(req, ret, issue_flags);
else
- io_rw_done(kiocb, ret);
+ io_rw_done(&req->rw.kiocb, ret);
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
- if (io_resubmit_prep(req)) {
+ if (io_resubmit_prep(req))
io_req_task_queue_reissue(req);
- } else {
- unsigned int cflags = io_put_rw_kbuf(req);
- struct io_ring_ctx *ctx = req->ctx;
-
- req_set_fail(req);
- if (!(issue_flags & IO_URING_F_NONBLOCK)) {
- mutex_lock(&ctx->uring_lock);
- __io_req_complete(req, issue_flags, ret, cflags);
- mutex_unlock(&ctx->uring_lock);
- } else {
- __io_req_complete(req, issue_flags, ret, cflags);
- }
- }
+ else
+ io_req_task_queue_fail(req, ret);
}
}
@@ -3015,15 +3291,18 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
return 0;
}
-static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
+ unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_mapped_ubuf *imu = req->imu;
u16 index, buf_index = req->buf_index;
if (likely(!imu)) {
+ struct io_ring_ctx *ctx = req->ctx;
+
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
+ io_req_set_rsrc_node(req, ctx, issue_flags);
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = READ_ONCE(ctx->user_bufs[index]);
req->imu = imu;
@@ -3049,59 +3328,64 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
mutex_lock(&ctx->uring_lock);
}
+static void io_buffer_add_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned int bgid)
+{
+ struct list_head *list;
+
+ list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ INIT_LIST_HEAD(&bl->buf_list);
+ bl->bgid = bgid;
+ list_add(&bl->list, list);
+}
+
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, struct io_buffer *kbuf,
- bool needs_lock)
+ int bgid, unsigned int issue_flags)
{
- struct io_buffer *head;
+ struct io_buffer *kbuf = req->kbuf;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
- io_ring_submit_lock(req->ctx, needs_lock);
+ io_ring_submit_lock(ctx, needs_lock);
- lockdep_assert_held(&req->ctx->uring_lock);
+ lockdep_assert_held(&ctx->uring_lock);
- head = xa_load(&req->ctx->io_buffers, bgid);
- if (head) {
- if (!list_empty(&head->list)) {
- kbuf = list_last_entry(&head->list, struct io_buffer,
- list);
- list_del(&kbuf->list);
- } else {
- kbuf = head;
- xa_erase(&req->ctx->io_buffers, bgid);
- }
+ bl = io_buffer_get_list(ctx, bgid);
+ if (bl && !list_empty(&bl->buf_list)) {
+ kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
+ list_del(&kbuf->list);
if (*len > kbuf->len)
*len = kbuf->len;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+ req->kbuf = kbuf;
} else {
kbuf = ERR_PTR(-ENOBUFS);
}
io_ring_submit_unlock(req->ctx, needs_lock);
-
return kbuf;
}
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_buffer *kbuf;
u16 bgid;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+ kbuf = io_buffer_select(req, len, bgid, issue_flags);
if (IS_ERR(kbuf))
return kbuf;
- req->rw.addr = (u64) (unsigned long) kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
return u64_to_user_ptr(kbuf->addr);
}
#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct compat_iovec __user *uiov;
compat_ssize_t clen;
@@ -3117,7 +3401,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3127,7 +3411,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
#endif
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
void __user *buf;
@@ -3139,7 +3423,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3148,12 +3432,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf;
+ struct io_buffer *kbuf = req->kbuf;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
iov[0].iov_len = kbuf->len;
return 0;
@@ -3163,52 +3446,77 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
- return io_compat_import(req, iov, needs_lock);
+ return io_compat_import(req, iov, issue_flags);
#endif
- return __io_iov_buffer_select(req, iov, needs_lock);
+ return __io_iov_buffer_select(req, iov, issue_flags);
}
-static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
- struct iov_iter *iter, bool needs_lock)
+static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
+ struct io_rw_state *s,
+ unsigned int issue_flags)
{
- void __user *buf = u64_to_user_ptr(req->rw.addr);
- size_t sqe_len = req->rw.len;
+ struct iov_iter *iter = &s->iter;
u8 opcode = req->opcode;
+ struct iovec *iovec;
+ void __user *buf;
+ size_t sqe_len;
ssize_t ret;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- *iovec = NULL;
- return io_import_fixed(req, rw, iter);
+ ret = io_import_fixed(req, rw, iter, issue_flags);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
}
/* buffer index only valid with fixed read/write, or buffer select */
- if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
+ if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
+ return ERR_PTR(-EINVAL);
+
+ buf = u64_to_user_ptr(req->rw.addr);
+ sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+ buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
if (IS_ERR(buf))
- return PTR_ERR(buf);
+ return ERR_CAST(buf);
req->rw.len = sqe_len;
}
- ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
- *iovec = NULL;
- return ret;
+ ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
}
+ iovec = s->fast_iov;
if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_iov_buffer_select(req, *iovec, needs_lock);
- if (!ret)
- iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
- *iovec = NULL;
- return ret;
+ ret = io_iov_buffer_select(req, iovec, issue_flags);
+ if (ret)
+ return ERR_PTR(ret);
+ iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
+ return NULL;
}
- return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+ ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
req->ctx->compat);
+ if (unlikely(ret < 0))
+ return ERR_PTR(ret);
+ return iovec;
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct io_rw_state *s,
+ unsigned int issue_flags)
+{
+ *iovec = __io_import_iovec(rw, req, s, issue_flags);
+ if (unlikely(IS_ERR(*iovec)))
+ return PTR_ERR(*iovec);
+
+ iov_iter_save_state(&s->iter, &s->iter_state);
+ return 0;
}
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
@@ -3225,6 +3533,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
struct kiocb *kiocb = &req->rw.kiocb;
struct file *file = req->file;
ssize_t ret = 0;
+ loff_t *ppos;
/*
* Don't support polled IO through this interface, and we can't
@@ -3233,9 +3542,12 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
*/
if (kiocb->ki_flags & IOCB_HIPRI)
return -EOPNOTSUPP;
- if (kiocb->ki_flags & IOCB_NOWAIT)
+ if ((kiocb->ki_flags & IOCB_NOWAIT) &&
+ !(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
+ ppos = io_kiocb_ppos(kiocb);
+
while (iov_iter_count(iter)) {
struct iovec iovec;
ssize_t nr;
@@ -3249,10 +3561,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
} else {
nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
}
if (nr < 0) {
@@ -3260,13 +3572,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
ret = nr;
break;
}
+ ret += nr;
if (!iov_iter_is_bvec(iter)) {
iov_iter_advance(iter, nr);
} else {
- req->rw.len -= nr;
req->rw.addr += nr;
+ req->rw.len -= nr;
+ if (!req->rw.len)
+ break;
}
- ret += nr;
if (nr != iovec.iov_len)
break;
}
@@ -3279,7 +3593,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
{
struct io_async_rw *rw = req->async_data;
- memcpy(&rw->iter, iter, sizeof(*iter));
+ memcpy(&rw->s.iter, iter, sizeof(*iter));
rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
@@ -3288,33 +3602,36 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
if (!iovec) {
unsigned iov_off = 0;
- rw->iter.iov = rw->fast_iov;
+ rw->s.iter.iov = rw->s.fast_iov;
if (iter->iov != fast_iov) {
iov_off = iter->iov - fast_iov;
- rw->iter.iov += iov_off;
+ rw->s.iter.iov += iov_off;
}
- if (rw->fast_iov != fast_iov)
- memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
+ if (rw->s.fast_iov != fast_iov)
+ memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
}
-static inline int io_alloc_async_data(struct io_kiocb *req)
+static inline bool io_alloc_async_data(struct io_kiocb *req)
{
WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
- return req->async_data == NULL;
+ if (req->async_data) {
+ req->flags |= REQ_F_ASYNC_DATA;
+ return false;
+ }
+ return true;
}
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov,
- struct iov_iter *iter, bool force)
+ struct io_rw_state *s, bool force)
{
if (!force && !io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (!req->async_data) {
+ if (!req_has_async_data(req)) {
struct io_async_rw *iorw;
if (io_alloc_async_data(req)) {
@@ -3322,10 +3639,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
return -ENOMEM;
}
- io_req_map_rw(req, iovec, fast_iov, iter);
+ io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
iorw = req->async_data;
/* we've copied and mapped the iter, ensure state is saved */
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
+ iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
}
return 0;
}
@@ -3333,10 +3650,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
struct io_async_rw *iorw = req->async_data;
- struct iovec *iov = iorw->fast_iov;
+ struct iovec *iov;
int ret;
- ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ /* submission path, ->uring_lock should already be taken */
+ ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
if (unlikely(ret < 0))
return ret;
@@ -3344,19 +3662,11 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
iorw->free_iovec = iov;
if (iov)
req->flags |= REQ_F_NEED_CLEANUP;
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
return 0;
}
-static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(!(req->file->f_mode & FMODE_READ)))
- return -EBADF;
- return io_prep_rw(req, sqe, READ);
-}
-
/*
- * This is our waitqueue callback handler, registered through lock_page_async()
+ * This is our waitqueue callback handler, registered through __folio_lock_async()
* when we initially tried to do the IO with the iocb armed our waitqueue.
* This gets called when the page is unlocked, and we generally expect that to
* happen when the page IO is completed and the page is now uptodate. This will
@@ -3428,7 +3738,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
- if (req->file->f_op->read_iter)
+ if (likely(req->file->f_op->read_iter))
return call_read_iter(req->file, &req->rw.kiocb, iter);
else if (req->file->f_op->read)
return loop_rw_iter(READ, req, iter);
@@ -3442,57 +3752,119 @@ static bool need_read_all(struct io_kiocb *req)
S_ISBLK(file_inode(req->file)->i_mode);
}
+static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct file *file = req->file;
+ int ret;
+
+ if (unlikely(!file || !(file->f_mode & mode)))
+ return -EBADF;
+
+ if (!io_req_ffs_set(req))
+ req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
+
+ kiocb->ki_flags = iocb_flags(file);
+ ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
+ if (unlikely(ret))
+ return ret;
+
+ /*
+ * If the file is marked O_NONBLOCK, still allow retry for it if it
+ * supports async. Otherwise it's impossible to use O_NONBLOCK files
+ * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
+ */
+ if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+ ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
+ req->flags |= REQ_F_NOWAIT;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
+ return -EOPNOTSUPP;
+
+ kiocb->private = NULL;
+ kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
+ kiocb->ki_complete = io_complete_rw_iopoll;
+ req->iopoll_completed = 0;
+ } else {
+ if (kiocb->ki_flags & IOCB_HIPRI)
+ return -EINVAL;
+ kiocb->ki_complete = io_complete_rw;
+ }
+
+ return 0;
+}
+
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
+ struct io_async_rw *rw;
ssize_t ret, ret2;
+ loff_t *ppos;
+
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ } else {
+ /*
+ * Safe and required to re-import if we're using provided
+ * buffers, as we dropped the selected one before retry.
+ */
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ }
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
+ rw = req->async_data;
+ s = &rw->s;
/*
* We come here from an earlier attempt, restore our state to
* match in case it doesn't. It's cheap enough that we don't
* need to make this conditional.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
iovec = NULL;
- } else {
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
}
- req->result = iov_iter_count(iter);
+ ret = io_rw_init_file(req, FMODE_READ);
+ if (unlikely(ret)) {
+ kfree(iovec);
+ return ret;
+ }
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req))) {
+ ret = io_setup_async_rw(req, iovec, s, true);
+ return ret ?: -EAGAIN;
+ }
kiocb->ki_flags |= IOCB_NOWAIT;
-
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, READ)) {
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
- return ret ?: -EAGAIN;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
}
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(READ, req->file, ppos, req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
}
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
+ /* if we can poll, just do that */
+ if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+ return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
@@ -3502,7 +3874,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
- } else if (ret <= 0 || ret == req->result || !force_nonblock ||
+ } else if (ret == req->result || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
@@ -3513,22 +3885,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* untouched in case of error. Restore it and we'll advance it
* manually if we need to.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
- ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+ ret2 = io_setup_async_rw(req, iovec, s, true);
if (ret2)
return ret2;
iovec = NULL;
rw = req->async_data;
+ s = &rw->s;
/*
* Now use our persistent iterator and state, if we aren't already.
* We've restored and mapped the iter to match.
*/
- if (iter != &rw->iter) {
- iter = &rw->iter;
- state = &rw->iter_state;
- }
do {
/*
@@ -3536,11 +3905,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* above or inside this loop. Advance the iter by the bytes
* that were consumed.
*/
- iov_iter_advance(iter, ret);
- if (!iov_iter_count(iter))
+ iov_iter_advance(&s->iter, ret);
+ if (!iov_iter_count(&s->iter))
break;
rw->bytes_done += ret;
- iov_iter_save_state(iter, state);
+ iov_iter_save_state(&s->iter, &s->iter_state);
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
@@ -3554,15 +3923,15 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* desired page gets unlocked. We can also get a partial read
* here, and if we do, then just retry at the new offset.
*/
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
} while (ret > 0);
done:
- kiocb_done(kiocb, ret, issue_flags);
+ kiocb_done(req, ret, issue_flags);
out_free:
/* it's faster to check here then delegate to kfree */
if (iovec)
@@ -3570,53 +3939,52 @@ out_free:
return 0;
}
-static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
- return -EBADF;
- return io_prep_rw(req, sqe, WRITE);
-}
-
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
ssize_t ret, ret2;
+ loff_t *ppos;
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
- iov_iter_restore(iter, state);
- iovec = NULL;
- } else {
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
+ } else {
+ struct io_async_rw *rw = req->async_data;
+
+ s = &rw->s;
+ iov_iter_restore(&s->iter, &s->iter_state);
+ iovec = NULL;
}
- req->result = iov_iter_count(iter);
+ ret = io_rw_init_file(req, FMODE_WRITE);
+ if (unlikely(ret)) {
+ kfree(iovec);
+ return ret;
+ }
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
- kiocb->ki_flags |= IOCB_NOWAIT;
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req)))
+ goto copy_iov;
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, WRITE))
- goto copy_iov;
+ /* file path doesn't support NOWAIT for non-direct_IO */
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+ (req->flags & REQ_F_ISREG))
+ goto copy_iov;
- /* file path doesn't support NOWAIT for non-direct_IO */
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
- (req->flags & REQ_F_ISREG))
- goto copy_iov;
+ kiocb->ki_flags |= IOCB_NOWAIT;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
+ }
+
+ ppos = io_kiocb_update_pos(req);
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
+ ret = rw_verify_area(WRITE, req->file, ppos, req->result);
if (unlikely(ret))
goto out_free;
@@ -3634,10 +4002,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
}
kiocb->ki_flags |= IOCB_WRITE;
- if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, iter);
+ if (likely(req->file->f_op->write_iter))
+ ret2 = call_write_iter(req->file, kiocb, &s->iter);
else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req, iter);
+ ret2 = loop_rw_iter(WRITE, req, &s->iter);
else
ret2 = -EINVAL;
@@ -3657,14 +4025,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov;
done:
- kiocb_done(kiocb, ret2, issue_flags);
+ kiocb_done(req, ret2, issue_flags);
} else {
copy_iov:
- iov_iter_restore(iter, state);
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+ iov_iter_restore(&s->iter, &s->iter_state);
+ ret = io_setup_async_rw(req, iovec, s, false);
return ret ?: -EAGAIN;
}
out_free:
@@ -3800,7 +4168,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
return 0;
}
-static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_mkdir *mkd = &req->mkdir;
int ret;
@@ -3849,7 +4217,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_symlinkat(struct io_kiocb *req, int issue_flags)
+static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_symlink *sl = &req->symlink;
int ret;
@@ -3899,7 +4267,7 @@ static int io_linkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_linkat(struct io_kiocb *req, int issue_flags)
+static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_hardlink *lnk = &req->hardlink;
int ret;
@@ -3966,18 +4334,11 @@ static int __io_splice_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- sp->file_in = NULL;
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
-
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
-
- sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
- (sp->flags & SPLICE_F_FD_IN_FIXED));
- if (!sp->file_in)
- return -EBADF;
- req->flags |= REQ_F_NEED_CLEANUP;
+ sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
return 0;
}
@@ -3992,20 +4353,29 @@ static int io_tee_prep(struct io_kiocb *req,
static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = &req->splice;
- struct file *in = sp->file_in;
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
+ struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
+
+ if (sp->flags & SPLICE_F_FD_IN_FIXED)
+ in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
+ else
+ in = io_file_get_normal(req, sp->splice_fd_in);
+ if (!in) {
+ ret = -EBADF;
+ goto done;
+ }
+
if (sp->len)
ret = do_tee(in, out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
- req->flags &= ~REQ_F_NEED_CLEANUP;
-
+done:
if (ret != sp->len)
req_set_fail(req);
io_req_complete(req, ret);
@@ -4024,15 +4394,24 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = &req->splice;
- struct file *in = sp->file_in;
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
loff_t *poff_in, *poff_out;
+ struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
+ if (sp->flags & SPLICE_F_FD_IN_FIXED)
+ in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
+ else
+ in = io_file_get_normal(req, sp->splice_fd_in);
+ if (!in) {
+ ret = -EBADF;
+ goto done;
+ }
+
poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
@@ -4041,8 +4420,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
- req->flags &= ~REQ_F_NEED_CLEANUP;
-
+done:
if (ret != sp->len)
req_set_fail(req);
io_req_complete(req, ret);
@@ -4063,13 +4441,56 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
+static int io_msg_ring_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
+ sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+ return -EINVAL;
+
+ req->msg.user_data = READ_ONCE(sqe->off);
+ req->msg.len = READ_ONCE(sqe->len);
+ return 0;
+}
+
+static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *target_ctx;
+ struct io_msg *msg = &req->msg;
+ bool filled;
+ int ret;
+
+ ret = -EBADFD;
+ if (req->file->f_op != &io_uring_fops)
+ goto done;
+
+ ret = -EOVERFLOW;
+ target_ctx = req->file->private_data;
+
+ spin_lock(&target_ctx->completion_lock);
+ filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
+ io_commit_cqring(target_ctx);
+ spin_unlock(&target_ctx->completion_lock);
+
+ if (filled) {
+ io_cqring_ev_posted(target_ctx);
+ ret = 0;
+ }
+
+done:
+ if (ret < 0)
+ req_set_fail(req);
+ __io_req_complete(req, issue_flags, ret, 0);
+ /* put file to avoid an attempt to IOPOLL the req */
+ io_put_file(req->file);
+ req->file = NULL;
+ return 0;
+}
+
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!req->file)
- return -EBADF;
-
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
@@ -4129,6 +4550,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
req->sync.len);
if (ret < 0)
req_set_fail(req);
+ else
+ fsnotify_modify(req->file);
io_req_complete(req, ret);
return 0;
}
@@ -4286,8 +4709,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
- int bgid, unsigned nbufs)
+static int __io_remove_buffers(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;
@@ -4296,18 +4719,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
return 0;
/* the head kbuf is the list itself */
- while (!list_empty(&buf->list)) {
+ while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
- nxt = list_first_entry(&buf->list, struct io_buffer, list);
+ nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
- kfree(nxt);
if (++i == nbufs)
return i;
+ cond_resched();
}
i++;
- kfree(buf);
- xa_erase(&ctx->io_buffers, bgid);
return i;
}
@@ -4316,24 +4737,24 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head;
+ struct io_buffer_list *bl;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
ret = -ENOENT;
- head = xa_load(&ctx->io_buffers, p->bgid);
- if (head)
- ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (bl)
+ ret = __io_remove_buffers(ctx, bl, p->nbufs);
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4372,58 +4793,104 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
+static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
+{
+ struct io_buffer *buf;
+ struct page *page;
+ int bufs_in_page;
+
+ /*
+ * Completions that don't happen inline (eg not under uring_lock) will
+ * add to ->io_buffers_comp. If we don't have any free buffers, check
+ * the completion list and splice those entries first.
+ */
+ if (!list_empty_careful(&ctx->io_buffers_comp)) {
+ spin_lock(&ctx->completion_lock);
+ if (!list_empty(&ctx->io_buffers_comp)) {
+ list_splice_init(&ctx->io_buffers_comp,
+ &ctx->io_buffers_cache);
+ spin_unlock(&ctx->completion_lock);
+ return 0;
+ }
+ spin_unlock(&ctx->completion_lock);
+ }
+
+ /*
+ * No free buffers and no completion entries either. Allocate a new
+ * page worth of buffer entries and add those to our freelist.
+ */
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!page)
+ return -ENOMEM;
+
+ list_add(&page->lru, &ctx->io_buffers_pages);
+
+ buf = page_address(page);
+ bufs_in_page = PAGE_SIZE / sizeof(*buf);
+ while (bufs_in_page) {
+ list_add_tail(&buf->list, &ctx->io_buffers_cache);
+ buf++;
+ bufs_in_page--;
+ }
+
+ return 0;
+}
+
+static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
+ struct io_buffer_list *bl)
{
struct io_buffer *buf;
u64 addr = pbuf->addr;
int i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
- buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
- if (!buf)
+ if (list_empty(&ctx->io_buffers_cache) &&
+ io_refill_buffer_cache(ctx))
break;
-
+ buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
+ list);
+ list_move_tail(&buf->list, &bl->buf_list);
buf->addr = addr;
buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
buf->bid = bid;
+ buf->bgid = pbuf->bgid;
addr += pbuf->len;
bid++;
- if (!*head) {
- INIT_LIST_HEAD(&buf->list);
- *head = buf;
- } else {
- list_add_tail(&buf->list, &(*head)->list);
- }
+ cond_resched();
}
- return i ? i : -ENOMEM;
+ return i ? 0 : -ENOMEM;
}
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head, *list;
+ struct io_buffer_list *bl;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
- list = head = xa_load(&ctx->io_buffers, p->bgid);
-
- ret = io_add_buffers(p, &head);
- if (ret >= 0 && !list) {
- ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
- if (ret < 0)
- __io_remove_buffers(ctx, head, p->bgid, -1U);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (unlikely(!bl)) {
+ bl = kmalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ io_buffer_add_list(ctx, bl, p->bgid);
}
+
+ ret = io_add_buffers(ctx, p, bl);
+err:
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4548,6 +5015,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ const char __user *path;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
@@ -4557,10 +5026,22 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->statx.dfd = READ_ONCE(sqe->fd);
req->statx.mask = READ_ONCE(sqe->len);
- req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ path = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
req->statx.flags = READ_ONCE(sqe->statx_flags);
+ req->statx.filename = getname_flags(path,
+ getname_statx_lookup_flags(req->statx.flags),
+ NULL);
+
+ if (IS_ERR(req->statx.filename)) {
+ int ret = PTR_ERR(req->statx.filename);
+
+ req->statx.filename = NULL;
+ return ret;
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -4730,6 +5211,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
+ return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -4756,8 +5239,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_sendmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4771,17 +5255,18 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
- if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ req_set_fail(req);
+ }
/* fast path, check for non-NULL to avoid function call */
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < min_ret)
- req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4817,13 +5302,13 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_flags = flags;
ret = sock_sendmsg(sock, &msg);
- if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
-
- if (ret < min_ret)
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
req_set_fail(req);
+ }
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4916,23 +5401,11 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
}
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_sr_msg *sr = &req->sr_msg;
- struct io_buffer *kbuf;
- kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
- if (IS_ERR(kbuf))
- return kbuf;
-
- sr->kbuf = kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
- return kbuf;
-}
-
-static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
-{
- return io_put_kbuf(req, req->sr_msg.kbuf);
+ return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
}
static int io_recvmsg_prep_async(struct io_kiocb *req)
@@ -4951,6 +5424,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
+ return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -4963,25 +5438,34 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
- int min_ret = 0;
- int ret, cflags = 0;
+ int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_recvmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4989,7 +5473,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
}
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
@@ -5006,20 +5490,30 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
kmsg->uaddr, flags);
- if (force_nonblock && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && force_nonblock)
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return io_setup_async_msg(req, kmsg);
+ }
+ req_set_fail(req);
+ } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+ req_set_fail(req);
+ }
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_recv_kbuf(req);
/* fast path, check for non-NULL to avoid function call */
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, cflags);
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5032,8 +5526,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
struct socket *sock;
struct iovec iov;
unsigned flags;
- int min_ret = 0;
- int ret, cflags = 0;
+ int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
sock = sock_from_file(req->file);
@@ -5041,7 +5534,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
buf = u64_to_user_ptr(kbuf->addr);
@@ -5065,16 +5558,29 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
min_ret = iov_iter_count(&msg.msg_iter);
ret = sock_recvmsg(sock, &msg, flags);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && force_nonblock)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return -EAGAIN;
+ }
+ req_set_fail(req);
+ } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_recv_kbuf(req);
- if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, cflags);
+ }
+
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5093,8 +5599,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->nofile = rlimit(RLIMIT_NOFILE);
accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
- (accept->flags & SOCK_CLOEXEC)))
+ if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
return -EINVAL;
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
@@ -5112,9 +5617,6 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
- if (req->file->f_flags & O_NONBLOCK)
- req->flags |= REQ_F_NOWAIT;
-
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
if (unlikely(fd < 0))
@@ -5172,7 +5674,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (req->async_data) {
+ if (req_has_async_data(req)) {
io = req->async_data;
} else {
ret = move_addr_to_kernel(req->connect.addr,
@@ -5188,7 +5690,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_connect_file(req->file, &io->address,
req->connect.addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req->async_data)
+ if (req_has_async_data(req))
return -EAGAIN;
if (io_alloc_async_data(req)) {
ret = -ENOMEM;
@@ -5241,52 +5743,23 @@ struct io_poll_table {
int error;
};
-static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
- __poll_t mask, io_req_tw_func_t func)
-{
- /* for instances that support it check for an event match first: */
- if (mask && !(mask & poll->events))
- return 0;
-
- trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
-
- list_del_init(&poll->wait.entry);
+#define IO_POLL_CANCEL_FLAG BIT(31)
+#define IO_POLL_REF_MASK GENMASK(30, 0)
- req->result = mask;
- req->io_task_work.func = func;
-
- /*
- * If this fails, then the task is exiting. When a task exits, the
- * work gets canceled, so just cancel this request as well instead
- * of executing it. We can't safely execute it anyway, as we may not
- * have the needed state needed for it anyway.
- */
- io_req_task_work_add(req);
- return 1;
+/*
+ * If the refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We
+ * can bump it and acquire ownership. Modifying a request while not owning it
+ * is disallowed; that prevents races when enqueueing task_work and between
+ * arming poll and wakeups.
+ */
+static inline bool io_poll_get_ownership(struct io_kiocb *req)
+{
+ return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
-static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
- __acquires(&req->ctx->completion_lock)
+static void io_poll_mark_cancelled(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- /* req->task == current here, checking PF_EXITING is safe */
- if (unlikely(req->task->flags & PF_EXITING))
- WRITE_ONCE(poll->canceled, true);
-
- if (!req->result && !READ_ONCE(poll->canceled)) {
- struct poll_table_struct pt = { ._key = poll->events };
-
- req->result = vfs_poll(req->file, &pt) & poll->events;
- }
-
- spin_lock(&ctx->completion_lock);
- if (!req->result && !READ_ONCE(poll->canceled)) {
- add_wait_queue(poll->head, &poll->wait);
- return true;
- }
-
- return false;
+ atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}
static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
@@ -5304,148 +5777,268 @@ static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
return &req->apoll->poll;
}
-static void io_poll_remove_double(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
+static void io_poll_req_insert(struct io_kiocb *req)
{
- struct io_poll_iocb *poll = io_poll_get_double(req);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct hlist_head *list;
- lockdep_assert_held(&req->ctx->completion_lock);
+ list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
+ hlist_add_head(&req->hash_node, list);
+}
+
+static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
+ wait_queue_func_t wake_func)
+{
+ poll->head = NULL;
+#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
+ /* mask in events that we always want/need */
+ poll->events = events | IO_POLL_UNMASK;
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, wake_func);
+}
- if (poll && poll->head) {
- struct wait_queue_head *head = poll->head;
+static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
+{
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
+ if (head) {
spin_lock_irq(&head->lock);
list_del_init(&poll->wait.entry);
- if (poll->wait.private)
- req_ref_put(req);
poll->head = NULL;
spin_unlock_irq(&head->lock);
}
}
-static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
- __must_hold(&req->ctx->completion_lock)
+static void io_poll_remove_entries(struct io_kiocb *req)
+{
+ /*
+ * Nothing to do if neither of those flags is set. Avoid dipping
+ * into the poll/apoll/double cachelines if we can.
+ */
+ if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
+ return;
+
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ if (req->flags & REQ_F_SINGLE_POLL)
+ io_poll_remove_entry(io_poll_get_single(req));
+ if (req->flags & REQ_F_DOUBLE_POLL)
+ io_poll_remove_entry(io_poll_get_double(req));
+ rcu_read_unlock();
+}
+
+/*
+ * All poll tw should go through this. Checks for poll events, manages
+ * references, does rewait, etc.
+ *
+ * Returns a negative error on failure. >0 when no action is required, which is
+ * either a spurious wakeup or a multishot CQE being served. 0 when it's done
+ * with the request, then the mask is stored in req->result.
+ */
+static int io_poll_check_events(struct io_kiocb *req, bool locked)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned flags = IORING_CQE_F_MORE;
- int error;
+ int v;
+
+ /* req->task == current here, checking PF_EXITING is safe */
+ if (unlikely(req->task->flags & PF_EXITING))
+ io_poll_mark_cancelled(req);
+
+ do {
+ v = atomic_read(&req->poll_refs);
- if (READ_ONCE(req->poll.canceled)) {
- error = -ECANCELED;
- req->poll.events |= EPOLLONESHOT;
+ /* tw handler should be the owner, and so have some references */
+ if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
+ return 0;
+ if (v & IO_POLL_CANCEL_FLAG)
+ return -ECANCELED;
+
+ if (!req->result) {
+ struct poll_table_struct pt = { ._key = req->apoll_events };
+ unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
+
+ if (unlikely(!io_assign_file(req, flags)))
+ return -EBADF;
+ req->result = vfs_poll(req->file, &pt) & req->apoll_events;
+ }
+
+ /* multishot, just fill a CQE and proceed */
+ if (req->result && !(req->apoll_events & EPOLLONESHOT)) {
+ __poll_t mask = mangle_poll(req->result & req->apoll_events);
+ bool filled;
+
+ spin_lock(&ctx->completion_lock);
+ filled = io_fill_cqe_aux(ctx, req->user_data, mask,
+ IORING_CQE_F_MORE);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (unlikely(!filled))
+ return -ECANCELED;
+ io_cqring_ev_posted(ctx);
+ } else if (req->result) {
+ return 0;
+ }
+
+ /*
+ * Release all references, retry if someone tried to restart
+ * task_work while we were executing it.
+ */
+ } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
+
+ return 1;
+}
+
+static void io_poll_task_func(struct io_kiocb *req, bool *locked)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ ret = io_poll_check_events(req, *locked);
+ if (ret > 0)
+ return;
+
+ if (!ret) {
+ req->result = mangle_poll(req->result & req->poll.events);
} else {
- error = mangle_poll(mask);
- }
- if (req->poll.events & EPOLLONESHOT)
- flags = 0;
- if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
- req->poll.events |= EPOLLONESHOT;
- flags = 0;
+ req->result = ret;
+ req_set_fail(req);
}
- if (flags & IORING_CQE_F_MORE)
- ctx->cq_extra++;
- return !(flags & IORING_CQE_F_MORE);
+ io_poll_remove_entries(req);
+ spin_lock(&ctx->completion_lock);
+ hash_del(&req->hash_node);
+ __io_req_complete_post(req, req->result, 0);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
}
-static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
- __must_hold(&req->ctx->completion_lock)
+static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
{
- bool done;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ ret = io_poll_check_events(req, *locked);
+ if (ret > 0)
+ return;
+
+ io_poll_remove_entries(req);
+ spin_lock(&ctx->completion_lock);
+ hash_del(&req->hash_node);
+ spin_unlock(&ctx->completion_lock);
- done = __io_poll_complete(req, mask);
- io_commit_cqring(req->ctx);
- return done;
+ if (!ret)
+ io_req_task_submit(req, locked);
+ else
+ io_req_complete_failed(req, ret);
}
-static void io_poll_task_func(struct io_kiocb *req, bool *locked)
+static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *nxt;
+ req->result = mask;
+ /*
+ * This is useful for poll that is armed on behalf of another
+ * request, and where the wakeup path could be on a different
+ * CPU. We want to avoid pulling in req->apoll->events for that
+ * case.
+ */
+ req->apoll_events = events;
+ if (req->opcode == IORING_OP_POLL_ADD)
+ req->io_task_work.func = io_poll_task_func;
+ else
+ req->io_task_work.func = io_apoll_task_func;
- if (io_poll_rewait(req, &req->poll)) {
- spin_unlock(&ctx->completion_lock);
- } else {
- bool done;
+ trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
+ io_req_task_work_add(req, false);
+}
- if (req->poll.done) {
- spin_unlock(&ctx->completion_lock);
- return;
- }
- done = __io_poll_complete(req, req->result);
- if (done) {
- io_poll_remove_double(req);
- hash_del(&req->hash_node);
- req->poll.done = true;
- } else {
- req->result = 0;
- add_wait_queue(req->poll.head, &req->poll.wait);
- }
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
+static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
+{
+ if (io_poll_get_ownership(req))
+ __io_poll_execute(req, res, events);
+}
- if (done) {
- nxt = io_put_req_find_next(req);
- if (nxt)
- io_req_task_submit(nxt, locked);
- }
- }
+static void io_poll_cancel_req(struct io_kiocb *req)
+{
+ io_poll_mark_cancelled(req);
+ /* kick tw, which should complete the request */
+ io_poll_execute(req, 0, 0);
}
-static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
- int sync, void *key)
+#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
+#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
{
- struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = io_poll_get_single(req);
+ struct io_kiocb *req = wqe_to_req(wait);
+ struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+ wait);
__poll_t mask = key_to_poll(key);
- unsigned long flags;
- /* for instances that support it check for an event match first: */
- if (mask && !(mask & poll->events))
- return 0;
- if (!(poll->events & EPOLLONESHOT))
- return poll->wait.func(&poll->wait, mode, sync, key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0, poll->events);
- list_del_init(&wait->entry);
+ /*
+ * If the waitqueue is being freed early but someone already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as poll->head is NULL'ed out, the request can be
+ * completed and freed, since io_poll_remove_entry()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
- if (poll->head) {
- bool done;
+ /* for instances that support it check for an event match first */
+ if (mask && !(mask & poll->events))
+ return 0;
- spin_lock_irqsave(&poll->head->lock, flags);
- done = list_empty(&poll->wait.entry);
- if (!done)
+ if (io_poll_get_ownership(req)) {
+ /* optional, saves extra locking for removal in tw handler */
+ if (mask && poll->events & EPOLLONESHOT) {
list_del_init(&poll->wait.entry);
- /* make sure double remove sees this as being gone */
- wait->private = NULL;
- spin_unlock_irqrestore(&poll->head->lock, flags);
- if (!done) {
- /* use wait func handler, so it matches the rq type */
- poll->wait.func(&poll->wait, mode, sync, key);
+ poll->head = NULL;
+ if (wqe_is_double(wait))
+ req->flags &= ~REQ_F_DOUBLE_POLL;
+ else
+ req->flags &= ~REQ_F_SINGLE_POLL;
}
+ __io_poll_execute(req, mask, poll->events);
}
- req_ref_put(req);
return 1;
}
-static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
- wait_queue_func_t wake_func)
-{
- poll->head = NULL;
- poll->done = false;
- poll->canceled = false;
-#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
- /* mask in events that we always want/need */
- poll->events = events | IO_POLL_UNMASK;
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, wake_func);
-}
-
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
struct wait_queue_head *head,
struct io_poll_iocb **poll_ptr)
{
struct io_kiocb *req = pt->req;
+ unsigned long wqe_private = (unsigned long) req;
/*
* The file being polled uses multiple waitqueues for poll handling
@@ -5453,10 +6046,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
* if this happens.
*/
if (unlikely(pt->nr_entries)) {
- struct io_poll_iocb *poll_one = poll;
+ struct io_poll_iocb *first = poll;
/* double add on the same waitqueue head, ignore */
- if (poll_one->head == head)
+ if (first->head == head)
return;
/* already have a 2nd entry, fail a third attempt */
if (*poll_ptr) {
@@ -5465,25 +6058,25 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
pt->error = -EINVAL;
return;
}
- /*
- * Can't handle multishot for double wait for now, turn it
- * into one-shot mode.
- */
- if (!(poll_one->events & EPOLLONESHOT))
- poll_one->events |= EPOLLONESHOT;
+
poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
if (!poll) {
pt->error = -ENOMEM;
return;
}
- io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
- req_ref_get(req);
- poll->wait.private = req;
+ /* mark as double wq entry */
+ wqe_private |= 1;
+ req->flags |= REQ_F_DOUBLE_POLL;
+ io_init_poll_iocb(poll, first->events, first->wait.func);
*poll_ptr = poll;
+ if (req->opcode == IORING_OP_POLL_ADD)
+ req->flags |= REQ_F_ASYNC_DATA;
}
+ req->flags |= REQ_F_SINGLE_POLL;
pt->nr_entries++;
poll->head = head;
+ poll->wait.private = (void *) wqe_private;
if (poll->events & EPOLLEXCLUSIVE)
add_wait_queue_exclusive(head, &poll->wait);
@@ -5491,103 +6084,79 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
add_wait_queue(head, &poll->wait);
}
-static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- struct async_poll *apoll = pt->req->apoll;
- __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
+ __io_queue_proc(&pt->req->poll, pt, head,
+ (struct io_poll_iocb **) &pt->req->async_data);
}
-static void io_async_task_func(struct io_kiocb *req, bool *locked)
+static int __io_arm_poll_handler(struct io_kiocb *req,
+ struct io_poll_iocb *poll,
+ struct io_poll_table *ipt, __poll_t mask)
{
- struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
-
- trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
-
- if (io_poll_rewait(req, &apoll->poll)) {
- spin_unlock(&ctx->completion_lock);
- return;
- }
-
- hash_del(&req->hash_node);
- io_poll_remove_double(req);
- apoll->poll.done = true;
- spin_unlock(&ctx->completion_lock);
-
- if (!READ_ONCE(apoll->poll.canceled))
- io_req_task_submit(req, locked);
- else
- io_req_complete_failed(req, -ECANCELED);
-}
-
-static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
- void *key)
-{
- struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = &req->apoll->poll;
-
- trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
- key_to_poll(key));
-
- return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
-}
-
-static void io_poll_req_insert(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
- hlist_add_head(&req->hash_node, list);
-}
-
-static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
- struct io_poll_iocb *poll,
- struct io_poll_table *ipt, __poll_t mask,
- wait_queue_func_t wake_func)
- __acquires(&ctx->completion_lock)
-{
- struct io_ring_ctx *ctx = req->ctx;
- bool cancel = false;
+ int v;
INIT_HLIST_NODE(&req->hash_node);
- io_init_poll_iocb(poll, mask, wake_func);
+ io_init_poll_iocb(poll, mask, io_poll_wake);
poll->file = req->file;
- poll->wait.private = req;
ipt->pt._key = mask;
ipt->req = req;
ipt->error = 0;
ipt->nr_entries = 0;
+ /*
+ * Take the ownership to delay any tw execution up until we're done
+ * with poll arming. see io_poll_get_ownership().
+ */
+ atomic_set(&req->poll_refs, 1);
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
- if (unlikely(!ipt->nr_entries) && !ipt->error)
- ipt->error = -EINVAL;
+
+ if (mask && (poll->events & EPOLLONESHOT)) {
+ io_poll_remove_entries(req);
+ /* no one else has access to the req, forget about the ref */
+ return mask;
+ }
+ if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
+ io_poll_remove_entries(req);
+ if (!ipt->error)
+ ipt->error = -EINVAL;
+ return 0;
+ }
spin_lock(&ctx->completion_lock);
- if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
- io_poll_remove_double(req);
- if (likely(poll->head)) {
- spin_lock_irq(&poll->head->lock);
- if (unlikely(list_empty(&poll->wait.entry))) {
- if (ipt->error)
- cancel = true;
- ipt->error = 0;
- mask = 0;
- }
- if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
- list_del_init(&poll->wait.entry);
- else if (cancel)
- WRITE_ONCE(poll->canceled, true);
- else if (!poll->done) /* actually waiting for an event */
- io_poll_req_insert(req);
- spin_unlock_irq(&poll->head->lock);
+ io_poll_req_insert(req);
+ spin_unlock(&ctx->completion_lock);
+
+ if (mask) {
+ /* can't multishot if failed, just queue the event we've got */
+ if (unlikely(ipt->error || !ipt->nr_entries))
+ poll->events |= EPOLLONESHOT;
+ __io_poll_execute(req, mask, poll->events);
+ return 0;
}
- return mask;
+ /*
+ * Release ownership. If someone tried to queue a tw while it was
+ * locked, kick it off for them.
+ */
+ v = atomic_dec_return(&req->poll_refs);
+ if (unlikely(v & IO_POLL_REF_MASK))
+ __io_poll_execute(req, 0, poll->events);
+ return 0;
+}
+
+static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+ struct async_poll *apoll = pt->req->apoll;
+
+ __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
enum {
@@ -5596,24 +6165,21 @@ enum {
IO_APOLL_READY
};
-static int io_arm_poll_handler(struct io_kiocb *req)
+static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
struct async_poll *apoll;
struct io_poll_table ipt;
- __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
- int rw;
+ __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
+ int ret;
- if (!req->file || !file_can_poll(req->file))
- return IO_APOLL_ABORTED;
- if (req->flags & REQ_F_POLLED)
- return IO_APOLL_ABORTED;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
+ if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+ return IO_APOLL_ABORTED;
if (def->pollin) {
- rw = READ;
mask |= POLLIN | POLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
@@ -5621,80 +6187,46 @@ static int io_arm_poll_handler(struct io_kiocb *req)
(req->sr_msg.msg_flags & MSG_ERRQUEUE))
mask &= ~POLLIN;
} else {
- rw = WRITE;
mask |= POLLOUT | POLLWRNORM;
}
-
- /* if we can't nonblock try, then no point in arming a poll handler */
- if (!io_file_supports_nowait(req, rw))
- return IO_APOLL_ABORTED;
-
- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
- if (unlikely(!apoll))
- return IO_APOLL_ABORTED;
+ if (def->poll_exclusive)
+ mask |= EPOLLEXCLUSIVE;
+ if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+ !list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del_init(&apoll->poll.wait.entry);
+ } else {
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+ if (unlikely(!apoll))
+ return IO_APOLL_ABORTED;
+ }
apoll->double_poll = NULL;
req->apoll = apoll;
req->flags |= REQ_F_POLLED;
ipt.pt._qproc = io_async_queue_proc;
- io_req_set_refcount(req);
- ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
- io_async_wake);
- spin_unlock(&ctx->completion_lock);
+ io_kbuf_recycle(req, issue_flags);
+
+ ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
if (ret || ipt.error)
return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
- trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
mask, apoll->poll.events);
return IO_APOLL_OK;
}
-static bool __io_poll_remove_one(struct io_kiocb *req,
- struct io_poll_iocb *poll, bool do_cancel)
- __must_hold(&req->ctx->completion_lock)
-{
- bool do_complete = false;
-
- if (!poll->head)
- return false;
- spin_lock_irq(&poll->head->lock);
- if (do_cancel)
- WRITE_ONCE(poll->canceled, true);
- if (!list_empty(&poll->wait.entry)) {
- list_del_init(&poll->wait.entry);
- do_complete = true;
- }
- spin_unlock_irq(&poll->head->lock);
- hash_del(&req->hash_node);
- return do_complete;
-}
-
-static bool io_poll_remove_one(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
-{
- bool do_complete;
-
- io_poll_remove_double(req);
- do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
-
- if (do_complete) {
- io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
- io_commit_cqring(req->ctx);
- req_set_fail(req);
- io_put_req_deferred(req);
- }
- return do_complete;
-}
-
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
- int posted = 0, i;
+ bool found = false;
+ int i;
spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
@@ -5702,16 +6234,15 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node) {
- if (io_match_task(req, tsk, cancel_all))
- posted += io_poll_remove_one(req);
+ if (io_match_task_safe(req, tsk, cancel_all)) {
+ hlist_del_init(&req->hash_node);
+ io_poll_cancel_req(req);
+ found = true;
+ }
}
}
spin_unlock(&ctx->completion_lock);
-
- if (posted)
- io_cqring_ev_posted(ctx);
-
- return posted != 0;
+ return found;
}
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
@@ -5732,19 +6263,26 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
return NULL;
}
+static bool io_poll_disarm(struct io_kiocb *req)
+ __must_hold(&ctx->completion_lock)
+{
+ if (!io_poll_get_ownership(req))
+ return false;
+ io_poll_remove_entries(req);
+ hash_del(&req->hash_node);
+ return true;
+}
+
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
bool poll_only)
__must_hold(&ctx->completion_lock)
{
- struct io_kiocb *req;
+ struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
- req = io_poll_find(ctx, sqe_addr, poll_only);
if (!req)
return -ENOENT;
- if (io_poll_remove_one(req))
- return 0;
-
- return -EALREADY;
+ io_poll_cancel_req(req);
+ return 0;
}
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
@@ -5794,23 +6332,6 @@ static int io_poll_update_prep(struct io_kiocb *req,
return 0;
}
-static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
- void *key)
-{
- struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = &req->poll;
-
- return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
-}
-
-static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
- struct poll_table_struct *p)
-{
- struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
-
- __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
-}
-
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll_iocb *poll = &req->poll;
@@ -5823,105 +6344,71 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
flags = READ_ONCE(sqe->len);
if (flags & ~IORING_POLL_ADD_MULTI)
return -EINVAL;
+ if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
+ return -EINVAL;
io_req_set_refcount(req);
- poll->events = io_poll_parse_events(sqe, flags);
+ req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_poll_iocb *poll = &req->poll;
- struct io_ring_ctx *ctx = req->ctx;
struct io_poll_table ipt;
- __poll_t mask;
- bool done;
+ int ret;
ipt.pt._qproc = io_poll_queue_proc;
- mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
- io_poll_wake);
-
- if (mask) { /* no async, we'd stolen it */
- ipt.error = 0;
- done = io_poll_complete(req, mask);
- }
- spin_unlock(&ctx->completion_lock);
-
- if (mask) {
- io_cqring_ev_posted(ctx);
- if (done)
- io_put_req(req);
- }
- return ipt.error;
+ ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
+ ret = ret ?: ipt.error;
+ if (ret)
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
}
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *preq;
- bool completing;
- int ret;
+ int ret2, ret = 0;
+ bool locked;
spin_lock(&ctx->completion_lock);
preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
- if (!preq) {
- ret = -ENOENT;
- goto err;
- }
-
- if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
- completing = true;
- ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
- goto err;
- }
-
- /*
- * Don't allow racy completion with singleshot, as we cannot safely
- * update those. For multishot, if we're racing with completion, just
- * let completion re-add it.
- */
- completing = !__io_poll_remove_one(preq, &preq->poll, false);
- if (completing && (preq->poll.events & EPOLLONESHOT)) {
- ret = -EALREADY;
- goto err;
- }
- /* we now have a detached poll request. reissue. */
- ret = 0;
-err:
- if (ret < 0) {
+ if (!preq || !io_poll_disarm(preq)) {
spin_unlock(&ctx->completion_lock);
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
- }
- /* only mask one event flags, keep behavior flags */
- if (req->poll_update.update_events) {
- preq->poll.events &= ~0xffff;
- preq->poll.events |= req->poll_update.events & 0xffff;
- preq->poll.events |= IO_POLL_UNMASK;
+ ret = preq ? -EALREADY : -ENOENT;
+ goto out;
}
- if (req->poll_update.update_user_data)
- preq->user_data = req->poll_update.new_user_data;
spin_unlock(&ctx->completion_lock);
- /* complete update request, we're done with it */
- io_req_complete(req, ret);
-
- if (!completing) {
- ret = io_poll_add(preq, issue_flags);
- if (ret < 0) {
- req_set_fail(preq);
- io_req_complete(preq, ret);
+ if (req->poll_update.update_events || req->poll_update.update_user_data) {
+ /* only mask one event flags, keep behavior flags */
+ if (req->poll_update.update_events) {
+ preq->poll.events &= ~0xffff;
+ preq->poll.events |= req->poll_update.events & 0xffff;
+ preq->poll.events |= IO_POLL_UNMASK;
}
+ if (req->poll_update.update_user_data)
+ preq->user_data = req->poll_update.new_user_data;
+
+ ret2 = io_poll_add(preq, issue_flags);
+ /* successfully updated, don't complete poll request */
+ if (!ret2)
+ goto out;
}
- return 0;
-}
-static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
-{
- req_set_fail(req);
- io_req_complete_post(req, -ETIME, 0);
+ req_set_fail(preq);
+ preq->result = -ECANCELED;
+ locked = !(issue_flags & IO_URING_F_UNLOCKED);
+ io_req_task_complete(preq, &locked);
+out:
+ if (ret < 0)
+ req_set_fail(req);
+ /* complete update request, we're done with it */
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
}
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
@@ -5938,8 +6425,12 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
atomic_read(&req->ctx->cq_timeouts) + 1);
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
- req->io_task_work.func = io_req_task_timeout;
- io_req_task_work_add(req);
+ if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
+ req_set_fail(req);
+
+ req->result = -ETIME;
+ req->io_task_work.func = io_req_task_complete;
+ io_req_task_work_add(req, false);
return HRTIMER_NORESTART;
}
@@ -5974,10 +6465,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (IS_ERR(req))
return PTR_ERR(req);
-
- req_set_fail(req);
- io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
- io_put_req_deferred(req);
+ io_req_task_queue_fail(req, -ECANCELED);
return 0;
}
@@ -6065,6 +6553,8 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
return -EINVAL;
if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
return -EFAULT;
+ if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
+ return -EINVAL;
} else if (tr->flags) {
/* timeout removal doesn't support flags */
return -EINVAL;
@@ -6126,7 +6616,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+ if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
+ IORING_TIMEOUT_ETIME_SUCCESS))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -6137,7 +6628,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
- if (!req->async_data && io_alloc_async_data(req))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
+ return -EFAULT;
+ if (io_alloc_async_data(req))
return -ENOMEM;
data = req->async_data;
@@ -6147,6 +6640,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
+ if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&req->timeout.list);
data->mode = io_translate_timeout_mode(flags);
hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
@@ -6260,16 +6757,21 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- if (ret != -ENOENT)
- return ret;
+ /*
+ * Fall-through even for -EALREADY, as we may have poll armed
+ * that need unarming.
+ */
+ if (!ret)
+ return 0;
spin_lock(&ctx->completion_lock);
+ ret = io_poll_cancel(ctx, sqe_addr, false);
+ if (ret != -ENOENT)
+ goto out;
+
spin_lock_irq(&ctx->timeout_lock);
ret = io_timeout_cancel(ctx, sqe_addr);
spin_unlock_irq(&ctx->timeout_lock);
- if (ret != -ENOENT)
- goto out;
- ret = io_poll_cancel(ctx, sqe_addr, false);
out:
spin_unlock(&ctx->completion_lock);
return ret;
@@ -6294,6 +6796,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
u64 sqe_addr = req->cancel.addr;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_tctx_node *node;
int ret;
@@ -6302,7 +6805,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
goto done;
/* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
@@ -6311,7 +6814,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
if (ret != -ENOENT)
break;
}
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
done:
if (ret < 0)
req_set_fail(req);
@@ -6338,6 +6841,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_uring_rsrc_update2 up;
int ret;
@@ -6346,11 +6850,12 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.nr = 0;
up.tags = 0;
up.resv = 0;
+ up.resv2 = 0;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
&up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret < 0)
req_set_fail(req);
@@ -6366,11 +6871,10 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
- return io_read_prep(req, sqe);
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
- return io_write_prep(req, sqe);
+ return io_prep_rw(req, sqe);
case IORING_OP_POLL_ADD:
return io_poll_add_prep(req, sqe);
case IORING_OP_POLL_REMOVE:
@@ -6435,6 +6939,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_symlinkat_prep(req, sqe);
case IORING_OP_LINKAT:
return io_linkat_prep(req, sqe);
+ case IORING_OP_MSG_RING:
+ return io_msg_ring_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6444,9 +6950,14 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_req_prep_async(struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].needs_async_setup)
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+
+ /* assign early for deferred execution for non-fixed file */
+ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+ req->file = io_file_get_normal(req, req->fd);
+ if (!def->needs_async_setup)
return 0;
- if (WARN_ON_ONCE(req->async_data))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
if (io_alloc_async_data(req))
return -EAGAIN;
@@ -6478,92 +6989,57 @@ static u32 io_get_sequence(struct io_kiocb *req)
return seq;
}
-static bool io_drain_req(struct io_kiocb *req)
+static __cold void io_drain_req(struct io_kiocb *req)
{
- struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
- u32 seq;
-
- if (req->flags & REQ_F_FAIL) {
- io_req_complete_fail_submit(req);
- return true;
- }
-
- /*
- * If we need to drain a request in the middle of a link, drain the
- * head request and the next request/link after the current link.
- * Considering sequential execution of links, IOSQE_IO_DRAIN will be
- * maintained for every request of our link.
- */
- if (ctx->drain_next) {
- req->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = false;
- }
- /* not interested in head, start from the first linked */
- io_for_each_link(pos, req->link) {
- if (pos->flags & REQ_F_IO_DRAIN) {
- ctx->drain_next = true;
- req->flags |= REQ_F_IO_DRAIN;
- break;
- }
- }
+ u32 seq = io_get_sequence(req);
/* Still need defer if there is pending req in defer list. */
- if (likely(list_empty_careful(&ctx->defer_list) &&
- !(req->flags & REQ_F_IO_DRAIN))) {
+ spin_lock(&ctx->completion_lock);
+ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
+ spin_unlock(&ctx->completion_lock);
+queue:
ctx->drain_active = false;
- return false;
+ io_req_task_queue(req);
+ return;
}
-
- seq = io_get_sequence(req);
- /* Still a chance to pass the sequence check */
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
- return false;
+ spin_unlock(&ctx->completion_lock);
ret = io_req_prep_async(req);
- if (ret)
- goto fail;
+ if (ret) {
+fail:
+ io_req_complete_failed(req, ret);
+ return;
+ }
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
ret = -ENOMEM;
-fail:
- io_req_complete_failed(req, ret);
- return true;
+ goto fail;
}
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
kfree(de);
- io_queue_async_work(req, NULL);
- return true;
+ goto queue;
}
- trace_io_uring_defer(ctx, req, req->user_data);
+ trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
de->req = req;
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock(&ctx->completion_lock);
- return true;
}
static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- kfree((void *)(unsigned long)req->rw.addr);
- break;
- case IORING_OP_RECVMSG:
- case IORING_OP_RECV:
- kfree(req->sr_msg.kbuf);
- break;
- }
+ spin_lock(&req->ctx->completion_lock);
+ io_put_kbuf_comp(req);
+ spin_unlock(&req->ctx->completion_lock);
}
if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -6586,11 +7062,6 @@ static void io_clean_op(struct io_kiocb *req)
kfree(io->free_iov);
break;
}
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(req->splice.file_in);
- break;
case IORING_OP_OPENAT:
case IORING_OP_OPENAT2:
if (req->open.filename)
@@ -6614,6 +7085,10 @@ static void io_clean_op(struct io_kiocb *req)
putname(req->hardlink.oldpath);
putname(req->hardlink.newpath);
break;
+ case IORING_OP_STATX:
+ if (req->statx.filename)
+ putname(req->statx.filename);
+ break;
}
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
@@ -6621,26 +7096,46 @@ static void io_clean_op(struct io_kiocb *req)
kfree(req->apoll);
req->apoll = NULL;
}
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_uring_task *tctx = req->task->io_uring;
-
- atomic_dec(&tctx->inflight_tracked);
- }
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
-
+ if (req->flags & REQ_F_ASYNC_DATA) {
+ kfree(req->async_data);
+ req->async_data = NULL;
+ }
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
+static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
+{
+ if (req->file || !io_op_defs[req->opcode].needs_file)
+ return true;
+
+ if (req->flags & REQ_F_FIXED_FILE)
+ req->file = io_file_get_fixed(req, req->fd, issue_flags);
+ else
+ req->file = io_file_get_normal(req, req->fd);
+ if (req->file)
+ return true;
+
+ req_set_fail(req);
+ req->result = -EBADF;
+ return false;
+}
+
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
const struct cred *creds = NULL;
int ret;
- if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ if (unlikely(!io_assign_file(req, issue_flags)))
+ return -EBADF;
+
+ if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
+ if (!io_op_defs[req->opcode].audit_skip)
+ audit_uring_entry(req->opcode);
+
switch (req->opcode) {
case IORING_OP_NOP:
ret = io_nop(req, issue_flags);
@@ -6751,18 +7246,24 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_LINKAT:
ret = io_linkat(req, issue_flags);
break;
+ case IORING_OP_MSG_RING:
+ ret = io_msg_ring(req, issue_flags);
+ break;
default:
ret = -EINVAL;
break;
}
+ if (!io_op_defs[req->opcode].audit_skip)
+ audit_uring_exit(!ret, ret);
+
if (creds)
revert_creds(creds);
if (ret)
return ret;
/* If the op doesn't have a file, we're not polling for it */
- if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
- io_iopoll_req_issued(req);
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+ io_iopoll_req_issued(req, issue_flags);
return 0;
}
@@ -6778,8 +7279,11 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ unsigned int issue_flags = IO_URING_F_UNLOCKED;
+ bool needs_poll = false;
struct io_kiocb *timeout;
- int ret = 0;
+ int ret = 0, err = -ECANCELED;
/* one will be dropped by ->io_free_work() after returning to io-wq */
if (!(req->flags & REQ_F_REFCOUNT))
@@ -6791,24 +7295,49 @@ static void io_wq_submit_work(struct io_wq_work *work)
if (timeout)
io_queue_linked_timeout(timeout);
+
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
- if (work->flags & IO_WQ_WORK_CANCEL)
- ret = -ECANCELED;
+ if (work->flags & IO_WQ_WORK_CANCEL) {
+fail:
+ io_req_task_queue_fail(req, err);
+ return;
+ }
+ if (!io_assign_file(req, issue_flags)) {
+ err = -EBADF;
+ work->flags |= IO_WQ_WORK_CANCEL;
+ goto fail;
+ }
- if (!ret) {
- do {
- ret = io_issue_sqe(req, 0);
- /*
- * We can get EAGAIN for polled IO even though we're
- * forcing a sync submission from here, since we can't
- * wait for request slots on the block side.
- */
- if (ret != -EAGAIN)
- break;
- cond_resched();
- } while (1);
+ if (req->flags & REQ_F_FORCE_ASYNC) {
+ bool opcode_poll = def->pollin || def->pollout;
+
+ if (opcode_poll && file_can_poll(req->file)) {
+ needs_poll = true;
+ issue_flags |= IO_URING_F_NONBLOCK;
+ }
}
+ do {
+ ret = io_issue_sqe(req, issue_flags);
+ if (ret != -EAGAIN)
+ break;
+ /*
+ * We can get EAGAIN for iopolled IO even though we're
+ * forcing a sync submission from here, since we can't
+ * wait for request slots on the block side.
+ */
+ if (!needs_poll) {
+ cond_resched();
+ continue;
+ }
+
+ if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
+ return;
+ /* aborted or ready, in either case retry blocking */
+ needs_poll = false;
+ issue_flags &= ~IO_URING_F_NONBLOCK;
+ } while (1);
+
/* avoid locking problems by failing it from a clean context */
if (ret)
io_req_task_queue_fail(req, ret);
@@ -6832,62 +7361,68 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
{
unsigned long file_ptr = (unsigned long) file;
- if (__io_file_supports_nowait(file, READ))
- file_ptr |= FFS_ASYNC_READ;
- if (__io_file_supports_nowait(file, WRITE))
- file_ptr |= FFS_ASYNC_WRITE;
- if (S_ISREG(file_inode(file)->i_mode))
- file_ptr |= FFS_ISREG;
+ file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
}
-static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd)
+static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
+ unsigned int issue_flags)
{
- struct file *file;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct file *file = NULL;
unsigned long file_ptr;
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_lock(&ctx->uring_lock);
+
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
- return NULL;
+ goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
- io_req_set_rsrc_node(req);
+ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+ io_req_set_rsrc_node(req, ctx, 0);
+out:
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
return file;
}
-static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd)
+/*
+ * Drop the file for requeue operations. Only used of req->file is the
+ * io_uring descriptor itself.
+ */
+static void io_drop_inflight_file(struct io_kiocb *req)
+{
+ if (unlikely(req->flags & REQ_F_INFLIGHT)) {
+ fput(req->file);
+ req->file = NULL;
+ req->flags &= ~REQ_F_INFLIGHT;
+ }
+}
+
+static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
struct file *file = fget(fd);
- trace_io_uring_file_get(ctx, fd);
+ trace_io_uring_file_get(req->ctx, req, req->user_data, fd);
/* we don't allow fixed io_uring files */
- if (file && unlikely(file->f_op == &io_uring_fops))
- io_req_track_inflight(req);
+ if (file && file->f_op == &io_uring_fops)
+ req->flags |= REQ_F_INFLIGHT;
return file;
}
-static inline struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed)
-{
- if (fixed)
- return io_file_get_fixed(ctx, req, fd);
- else
- return io_file_get_normal(ctx, req, fd);
-}
-
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
struct io_kiocb *prev = req->timeout.prev;
- int ret;
+ int ret = -ENOENT;
if (prev) {
- ret = io_try_cancel_userdata(req, prev->user_data);
+ if (!(req->task->flags & PF_EXITING))
+ ret = io_try_cancel_userdata(req, prev->user_data);
io_req_complete_post(req, ret ?: -ETIME, 0);
io_put_req(prev);
} else {
@@ -6921,7 +7456,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
req->io_task_work.func = io_req_task_link_timeout;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
return HRTIMER_NORESTART;
}
@@ -6947,67 +7482,64 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
-static void __io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+
+ switch (io_arm_poll_handler(req, 0)) {
+ case IO_APOLL_READY:
+ io_req_task_queue(req);
+ break;
+ case IO_APOLL_ABORTED:
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req, NULL);
+ break;
+ case IO_APOLL_OK:
+ break;
+ }
+
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+}
+
+static inline void __io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
struct io_kiocb *linked_timeout;
int ret;
-issue_sqe:
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ if (req->flags & REQ_F_COMPLETE_INLINE) {
+ io_req_add_compl_list(req);
+ return;
+ }
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
if (likely(!ret)) {
- if (req->flags & REQ_F_COMPLETE_INLINE) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
- return;
- }
-
linked_timeout = io_prep_linked_timeout(req);
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- linked_timeout = io_prep_linked_timeout(req);
-
- switch (io_arm_poll_handler(req)) {
- case IO_APOLL_READY:
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
- goto issue_sqe;
- case IO_APOLL_ABORTED:
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req, NULL);
- break;
- }
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
+ io_queue_sqe_arm_apoll(req);
} else {
io_req_complete_failed(req, ret);
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- if (unlikely(req->ctx->drain_active) && io_drain_req(req))
- return;
-
- if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
- __io_queue_sqe(req);
- } else if (req->flags & REQ_F_FAIL) {
+ if (req->flags & REQ_F_FAIL) {
io_req_complete_fail_submit(req);
+ } else if (unlikely(req->ctx->drain_active)) {
+ io_drain_req(req);
} else {
int ret = io_req_prep_async(req);
@@ -7018,6 +7550,15 @@ static inline void io_queue_sqe(struct io_kiocb *req)
}
}
+static inline void io_queue_sqe(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
+ __io_queue_sqe(req);
+ else
+ io_queue_sqe_fallback(req);
+}
+
/*
* Check SQE restrictions (opcode and flags).
*
@@ -7027,9 +7568,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
- if (likely(!ctx->restricted))
- return true;
-
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
return false;
@@ -7044,16 +7582,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
return true;
}
+static void io_init_req_drain(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *head = ctx->submit_state.link.head;
+
+ ctx->drain_active = true;
+ if (head) {
+ /*
+ * If we need to drain a request in the middle of a link, drain
+ * the head request and the next request/link after the current
+ * link. Considering sequential execution of links,
+ * REQ_F_IO_DRAIN will be maintained for every request of our
+ * link.
+ */
+ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ ctx->drain_next = true;
+ }
+}
+
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_state *state;
unsigned int sqe_flags;
- int personality, ret = 0;
+ int personality;
+ u8 opcode;
/* req is partially pre-initialised, see io_preinit_req() */
- req->opcode = READ_ONCE(sqe->opcode);
+ req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
@@ -7061,49 +7618,72 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->fixed_rsrc_refs = NULL;
req->task = current;
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
- return -EINVAL;
- if (unlikely(req->opcode >= IORING_OP_LAST))
+ if (unlikely(opcode >= IORING_OP_LAST)) {
+ req->opcode = 0;
return -EINVAL;
- if (!io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
+ }
+ if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
+ /* enforce forwards compatibility on users */
+ if (sqe_flags & ~SQE_VALID_FLAGS)
+ return -EINVAL;
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[opcode].buffer_select)
+ return -EOPNOTSUPP;
+ if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
+ ctx->drain_disabled = true;
+ if (sqe_flags & IOSQE_IO_DRAIN) {
+ if (ctx->drain_disabled)
+ return -EOPNOTSUPP;
+ io_init_req_drain(req);
+ }
+ }
+ if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+ if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
+ return -EACCES;
+ /* knock it to the slow queue path, will be drained there */
+ if (ctx->drain_active)
+ req->flags |= REQ_F_FORCE_ASYNC;
+ /* if there is no link, we're at "next" request and need to drain */
+ if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
+ ctx->drain_next = false;
+ ctx->drain_active = true;
+ req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ }
+ }
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select)
- return -EOPNOTSUPP;
- if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
- ctx->drain_active = true;
+ if (io_op_defs[opcode].needs_file) {
+ struct io_submit_state *state = &ctx->submit_state;
+
+ req->fd = READ_ONCE(sqe->fd);
+
+ /*
+ * Plug now if we have more than 2 IO left after this, and the
+ * target is potentially a read/write to block based storage.
+ */
+ if (state->need_plug && io_op_defs[opcode].plug) {
+ state->plug_started = true;
+ state->need_plug = false;
+ blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+ }
+ }
personality = READ_ONCE(sqe->personality);
if (personality) {
+ int ret;
+
req->creds = xa_load(&ctx->personalities, personality);
if (!req->creds)
return -EINVAL;
get_cred(req->creds);
+ ret = security_uring_override_creds(req->creds);
+ if (ret) {
+ put_cred(req->creds);
+ return ret;
+ }
req->flags |= REQ_F_CREDS;
}
- state = &ctx->submit_state;
-
- /*
- * Plug now if we have more than 1 IO left after this, and the target
- * is potentially a read/write to block based storage.
- */
- if (!state->plug_started && state->ios_left > 1 &&
- io_op_defs[req->opcode].plug) {
- blk_start_plug(&state->plug);
- state->plug_started = true;
- }
-
- if (io_op_defs[req->opcode].needs_file) {
- req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
- if (unlikely(!req->file))
- ret = -EBADF;
- }
- state->ios_left--;
- return ret;
+ return io_req_prep(req, sqe);
}
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -7115,7 +7695,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
-fail_req:
+ trace_io_uring_req_failed(sqe, ctx, req, ret);
+
/* fail even hard links since we don't submit */
if (link->head) {
/*
@@ -7138,14 +7719,10 @@ fail_req:
return ret;
}
req_fail_link_node(req, ret);
- } else {
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
}
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
req->flags, true,
ctx->flags & IORING_SETUP_SQPOLL);
@@ -7171,33 +7748,32 @@ fail_req:
link->last->link = req;
link->last = req;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+ return 0;
/* last request of a link, enqueue the link */
- if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- link->head = NULL;
- io_queue_sqe(head);
- }
- } else {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- link->head = req;
- link->last = req;
- } else {
- io_queue_sqe(req);
- }
+ link->head = NULL;
+ req = head;
+ } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ link->head = req;
+ link->last = req;
+ return 0;
}
+ io_queue_sqe(req);
return 0;
}
/*
* Batched submission is done, ensure local IO is flushed out.
*/
-static void io_submit_state_end(struct io_submit_state *state,
- struct io_ring_ctx *ctx)
+static void io_submit_state_end(struct io_ring_ctx *ctx)
{
+ struct io_submit_state *state = &ctx->submit_state;
+
if (state->link.head)
io_queue_sqe(state->link.head);
- if (state->compl_nr)
- io_submit_flush_completions(ctx);
+ /* flush only after queuing links as they can generate completions */
+ io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
}
@@ -7209,7 +7785,8 @@ static void io_submit_state_start(struct io_submit_state *state,
unsigned int max_ios)
{
state->plug_started = false;
- state->ios_left = max_ios;
+ state->need_plug = max_ios > 2;
+ state->submit_nr = max_ios;
/* set only head, no need to init link_last in advance */
state->link.head = NULL;
}
@@ -7261,45 +7838,51 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
+ unsigned int entries = io_sqring_entries(ctx);
int submitted = 0;
+ if (unlikely(!entries))
+ return 0;
/* make sure SQ entry isn't read before tail */
- nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
- if (!percpu_ref_tryget_many(&ctx->refs, nr))
- return -EAGAIN;
+ nr = min3(nr, ctx->sq_entries, entries);
io_get_task_refs(nr);
io_submit_state_start(&ctx->submit_state, nr);
- while (submitted < nr) {
+ do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- req = io_alloc_req(ctx);
- if (unlikely(!req)) {
+ if (unlikely(!io_alloc_req_refill(ctx))) {
if (!submitted)
submitted = -EAGAIN;
break;
}
+ req = io_alloc_req(ctx);
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- list_add(&req->inflight_entry, &ctx->submit_state.free_list);
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
- if (io_submit_sqe(ctx, req, sqe))
- break;
- }
+ if (io_submit_sqe(ctx, req, sqe)) {
+ /*
+ * Continue submitting even for sqe failure if the
+ * ring was setup with IORING_SETUP_SUBMIT_ALL
+ */
+ if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
+ break;
+ }
+ } while (submitted < nr);
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
int unused = nr - ref_used;
current->io_uring->cached_refs += unused;
- percpu_ref_put_many(&ctx->refs, unused);
}
- io_submit_state_end(&ctx->submit_state, ctx);
+ io_submit_state_end(ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -7338,16 +7921,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
- if (!list_empty(&ctx->iopoll_list) || to_submit) {
- unsigned nr_events = 0;
+ if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
creds = override_creds(ctx->sq_creds);
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0);
+ if (!wq_list_empty(&ctx->iopoll_list))
+ io_do_iopoll(ctx, true);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -7367,7 +7949,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
return ret;
}
-static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
struct io_ring_ctx *ctx;
unsigned sq_thread_idle = 0;
@@ -7410,6 +7992,8 @@ static int io_sq_thread(void *data)
set_cpus_allowed_ptr(current, cpu_online_mask);
current->flags |= PF_NO_SETAFFINITY;
+ audit_alloc_kernel(current);
+
mutex_lock(&sqd->lock);
while (1) {
bool cap_entries, sqt_spin = false;
@@ -7424,7 +8008,7 @@ static int io_sq_thread(void *data)
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
- if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
if (io_run_task_work())
@@ -7438,17 +8022,24 @@ static int io_sq_thread(void *data)
}
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
- if (!io_sqd_events_pending(sqd) && !current->task_works) {
+ if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
bool needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
io_ring_set_wakeup_flag(ctx);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
+ !wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
break;
}
+
+ /*
+ * Ensure the store of the wakeup flag is not
+ * reordered with the load of the SQ tail
+ */
+ smp_mb();
+
if (io_sqring_entries(ctx)) {
needs_sched = false;
break;
@@ -7475,6 +8066,8 @@ static int io_sq_thread(void *data)
io_run_task_work();
mutex_unlock(&sqd->lock);
+ audit_free(current);
+
complete(&sqd->exited);
do_exit(0);
}
@@ -7518,17 +8111,17 @@ static int io_run_task_work_sig(void)
{
if (io_run_task_work())
return 1;
- if (!signal_pending(current))
- return 0;
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
return -ERESTARTSYS;
- return -EINTR;
+ if (task_sigpending(current))
+ return -EINTR;
+ return 0;
}
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
- signed long *timeout)
+ ktime_t timeout)
{
int ret;
@@ -7540,8 +8133,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (test_bit(0, &ctx->check_cq_overflow))
return 1;
- *timeout = schedule_timeout(*timeout);
- return !*timeout ? -ETIME : 1;
+ if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
+ return -ETIME;
+ return 1;
}
/*
@@ -7554,7 +8148,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
{
struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings;
- signed long timeout = MAX_SCHEDULE_TIMEOUT;
+ ktime_t timeout = KTIME_MAX;
int ret;
do {
@@ -7565,14 +8159,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
} while (1);
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = timespec64_to_jiffies(&ts);
- }
-
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
@@ -7586,6 +8172,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ }
+
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
@@ -7602,7 +8196,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
}
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+ ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
@@ -7621,7 +8215,7 @@ static void io_free_page_table(void **table, size_t size)
kfree(table);
}
-static void **io_alloc_page_table(size_t size)
+static __cold void **io_alloc_page_table(size_t size)
{
unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
size_t init_size = size;
@@ -7650,16 +8244,21 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
kfree(ref_node);
}
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
unsigned long flags;
bool first_add = false;
+ unsigned long delay = HZ;
spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
node->done = true;
+ /* if we are mid-quiesce then do not delay */
+ if (node->rsrc_data->quiesce)
+ delay = 0;
+
while (!list_empty(&ctx->rsrc_ref_list)) {
node = list_first_entry(&ctx->rsrc_ref_list,
struct io_rsrc_node, node);
@@ -7672,10 +8271,10 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
if (first_add)
- mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
+ mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}
-static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
+static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
struct io_rsrc_node *ref_node;
@@ -7696,10 +8295,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill)
+ __must_hold(&ctx->uring_lock)
{
WARN_ON_ONCE(!ctx->rsrc_backup_node);
WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
+ io_rsrc_refs_drop(ctx);
+
if (data_to_kill) {
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
@@ -7723,11 +8325,12 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
if (ctx->rsrc_backup_node)
return 0;
- ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
+ ctx->rsrc_backup_node = io_rsrc_node_alloc();
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
-static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
+static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
+ struct io_ring_ctx *ctx)
{
int ret;
@@ -7750,7 +8353,15 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
ret = wait_for_completion_interruptible(&data->done);
if (!ret) {
mutex_lock(&ctx->uring_lock);
- break;
+ if (atomic_read(&data->refs) > 0) {
+ /*
+ * it has been revived by another thread while
+ * we were unlocked
+ */
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ break;
+ }
}
atomic_inc(&data->refs);
@@ -7783,9 +8394,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
kfree(data);
}
-static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
- u64 __user *utags, unsigned nr,
- struct io_rsrc_data **pdata)
+static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+ u64 __user *utags, unsigned nr,
+ struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
int ret = -ENOMEM;
@@ -8045,10 +8656,15 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_queue_head(&sk->sk_receive_queue, skb);
- for (i = 0; i < nr_files; i++)
- fput(fpl->fp[i]);
+ for (i = 0; i < nr; i++) {
+ struct file *file = io_file_from_index(ctx, i + offset);
+
+ if (file)
+ fput(file);
+ }
} else {
kfree_skb(skb);
+ free_uid(fpl->user);
kfree(fpl);
}
@@ -8174,8 +8790,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
io_ring_submit_lock(ctx, lock_ring);
spin_lock(&ctx->completion_lock);
- io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
- ctx->cq_extra++;
+ io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
@@ -8337,13 +8952,15 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc)
{
+ u64 *tag_slot = io_get_tag_slot(data, idx);
struct io_rsrc_put *prsrc;
prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
if (!prsrc)
return -ENOMEM;
- prsrc->tag = *io_get_tag_slot(data, idx);
+ prsrc->tag = *tag_slot;
+ *tag_slot = 0;
prsrc->rsrc = rsrc;
list_add(&prsrc->list, &node->rsrc_list);
return 0;
@@ -8353,12 +8970,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
struct io_ring_ctx *ctx = req->ctx;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret = -EBADF;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
if (file->f_op == &io_uring_fops)
goto err;
ret = -ENXIO;
@@ -8399,7 +9016,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret)
fput(file);
return ret;
@@ -8409,11 +9026,12 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
unsigned int offset = req->close.file_slot - 1;
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_fixed_file *file_slot;
struct file *file;
- int ret, i;
+ int ret;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENXIO;
if (unlikely(!ctx->file_data))
goto out;
@@ -8424,8 +9042,8 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
if (ret)
goto out;
- i = array_index_nospec(offset, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ offset = array_index_nospec(offset, ctx->nr_user_files);
+ file_slot = io_fixed_file_slot(&ctx->file_table, offset);
ret = -EBADF;
if (!file_slot->file_ptr)
goto out;
@@ -8439,7 +9057,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
io_rsrc_node_switch(ctx, ctx->file_data);
ret = 0;
out:
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
return ret;
}
@@ -8481,8 +9099,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (file_slot->file_ptr) {
file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- err = io_queue_rsrc_removal(data, up->offset + done,
- ctx->rsrc_node, file);
+ err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
if (err)
break;
file_slot->file_ptr = 0;
@@ -8507,7 +9124,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- *io_get_tag_slot(data, up->offset + done) = tag;
+ *io_get_tag_slot(data, i) = tag;
io_fixed_file_set(file_slot, file);
err = io_sqe_file_register(ctx, file, i);
if (err) {
@@ -8555,8 +9172,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
return io_wq_create(concurrency, &data);
}
-static int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx)
+static __cold int io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
@@ -8565,8 +9182,16 @@ static int io_uring_alloc_task_context(struct task_struct *task,
if (unlikely(!tctx))
return -ENOMEM;
+ tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+ sizeof(struct file *), GFP_KERNEL);
+ if (unlikely(!tctx->registered_rings)) {
+ kfree(tctx);
+ return -ENOMEM;
+ }
+
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
if (unlikely(ret)) {
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8575,6 +9200,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8582,10 +9208,10 @@ static int io_uring_alloc_task_context(struct task_struct *task,
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_idle, 0);
- atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
+ INIT_WQ_LIST(&tctx->prior_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
@@ -8598,13 +9224,14 @@ void __io_uring_free(struct task_struct *tsk)
WARN_ON_ONCE(tctx->io_wq);
WARN_ON_ONCE(tctx->cached_refs);
+ kfree(tctx->registered_rings);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
}
-static int io_sq_offload_create(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
int ret;
@@ -8627,6 +9254,10 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_sq_data *sqd;
bool attached;
+ ret = security_uring_sqpoll();
+ if (ret)
+ return ret;
+
sqd = io_get_sq_data(p, &attached);
if (IS_ERR(sqd)) {
ret = PTR_ERR(sqd);
@@ -8753,10 +9384,9 @@ static void io_mem_free(void *ptr)
static void *io_mem_alloc(size_t size)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
- __GFP_NORETRY | __GFP_ACCOUNT;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
- return (void *) __get_free_pages(gfp_flags, get_order(size));
+ return (void *) __get_free_pages(gfp, get_order(size));
}
static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
@@ -9152,7 +9782,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
i = array_index_nospec(offset, ctx->nr_user_bufs);
if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
- err = io_queue_rsrc_removal(ctx->buf_data, offset,
+ err = io_queue_rsrc_removal(ctx->buf_data, i,
ctx->rsrc_node, ctx->user_bufs[i]);
if (unlikely(err)) {
io_buffer_unmap(ctx, &imu);
@@ -9171,33 +9801,55 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
return done ? done : err;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int eventfd_async)
{
+ struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
- if (ctx->cq_ev_fd)
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
- ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ctx->cq_ev_fd)) {
- int ret = PTR_ERR(ctx->cq_ev_fd);
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
- ctx->cq_ev_fd = NULL;
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
return ret;
}
-
+ ev_fd->eventfd_async = eventfd_async;
+ ctx->has_evfd = true;
+ rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
}
+static void io_eventfd_put(struct rcu_head *rcu)
+{
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+
+ eventfd_ctx_put(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+}
+
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
- if (ctx->cq_ev_fd) {
- eventfd_ctx_put(ctx->cq_ev_fd);
- ctx->cq_ev_fd = NULL;
+ struct io_ev_fd *ev_fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd) {
+ ctx->has_evfd = false;
+ rcu_assign_pointer(ctx->io_ev_fd, NULL);
+ call_rcu(&ev_fd->rcu, io_eventfd_put);
return 0;
}
@@ -9206,38 +9858,49 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
- struct io_buffer *buf;
- unsigned long index;
+ int i;
- xa_for_each(&ctx->io_buffers, index, buf) {
- __io_remove_buffers(ctx, buf, index, -1U);
- cond_resched();
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
+ struct list_head *list = &ctx->io_buffers[i];
+
+ while (!list_empty(list)) {
+ struct io_buffer_list *bl;
+
+ bl = list_first_entry(list, struct io_buffer_list, list);
+ __io_remove_buffers(ctx, bl, -1U);
+ list_del(&bl->list);
+ kfree(bl);
+ }
}
-}
-static void io_req_cache_free(struct list_head *list)
-{
- struct io_kiocb *req, *nxt;
+ while (!list_empty(&ctx->io_buffers_pages)) {
+ struct page *page;
- list_for_each_entry_safe(req, nxt, list, inflight_entry) {
- list_del(&req->inflight_entry);
- kmem_cache_free(req_cachep, req);
+ page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
}
}
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
+ int nr = 0;
mutex_lock(&ctx->uring_lock);
+ io_flush_cached_locked_reqs(ctx, state);
- if (state->free_reqs) {
- kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
- state->free_reqs = 0;
- }
+ while (state->free_list.next) {
+ struct io_wq_work_node *node;
+ struct io_kiocb *req;
- io_flush_cached_locked_reqs(ctx, state);
- io_req_cache_free(&state->free_list);
+ node = wq_stack_extract(&state->free_list);
+ req = container_of(node, struct io_kiocb, comp_list);
+ kmem_cache_free(req_cachep, req);
+ nr++;
+ }
+ if (nr)
+ percpu_ref_put_many(&ctx->refs, nr);
mutex_unlock(&ctx->uring_lock);
}
@@ -9247,7 +9910,19 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
+{
+ struct async_poll *apoll;
+
+ while (!list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del(&apoll->poll.wait.entry);
+ kfree(apoll);
+ }
+}
+
+static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -9256,6 +9931,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
+ io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
io_wait_rsrc_data(ctx->file_data);
@@ -9267,8 +9943,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
__io_sqe_files_unregister(ctx);
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
- mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
+ io_flush_apoll_cache(ctx);
+ mutex_unlock(&ctx->uring_lock);
io_destroy_buffers(ctx);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
@@ -9279,6 +9956,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (ctx->rsrc_backup_node)
io_rsrc_node_destroy(ctx->rsrc_backup_node);
flush_delayed_work(&ctx->rsrc_put_work);
+ flush_delayed_work(&ctx->fallback_work);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
@@ -9301,6 +9979,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_hash);
kfree(ctx->dummy_ubuf);
+ kfree(ctx->io_buffers);
kfree(ctx);
}
@@ -9309,7 +9988,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->poll_wait, wait);
+ poll_wait(file, &ctx->cq_wait, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
@@ -9356,7 +10035,7 @@ struct io_tctx_exit {
struct io_ring_ctx *ctx;
};
-static void io_tctx_exit_cb(struct callback_head *cb)
+static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_exit *work;
@@ -9371,14 +10050,14 @@ static void io_tctx_exit_cb(struct callback_head *cb)
complete(&work->completion);
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
return req->ctx == data;
}
-static void io_ring_exit_work(struct work_struct *work)
+static __cold void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
unsigned long timeout = jiffies + HZ * 60 * 5;
@@ -9407,6 +10086,8 @@ static void io_ring_exit_work(struct work_struct *work)
io_sq_thread_unpark(sqd);
}
+ io_req_caches_free(ctx);
+
if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
/* there is little hope left, don't run it too often */
interval = HZ * 60;
@@ -9433,7 +10114,6 @@ static void io_ring_exit_work(struct work_struct *work)
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
if (WARN_ON_ONCE(ret))
continue;
- wake_up_process(node->task);
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&exit.completion);
@@ -9447,8 +10127,8 @@ static void io_ring_exit_work(struct work_struct *work)
}
/* Returns true if we found and killed one or more timeouts */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
@@ -9470,7 +10150,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
return canceled != 0;
}
-static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
unsigned long index;
struct creds *creds;
@@ -9517,30 +10197,20 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_task_cancel *cancel = data;
- bool ret;
-
- if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
- struct io_ring_ctx *ctx = req->ctx;
- /* protect against races with linked timeouts */
- spin_lock(&ctx->completion_lock);
- ret = io_match_task(req, cancel->task, cancel->all);
- spin_unlock(&ctx->completion_lock);
- } else {
- ret = io_match_task(req, cancel->task, cancel->all);
- }
- return ret;
+ return io_match_task_safe(req, cancel->task, cancel->all);
}
-static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all)
+static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_defer_entry *de;
LIST_HEAD(list);
spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_match_task(de->req, task, cancel_all)) {
+ if (io_match_task_safe(de->req, task, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
@@ -9558,7 +10228,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
return true;
}
-static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
struct io_tctx_node *node;
enum io_wq_cancel cret;
@@ -9582,9 +10252,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
+static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
@@ -9608,7 +10278,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
(ctx->sq_data && ctx->sq_data->thread == current)) {
- while (!list_empty_careful(&ctx->iopoll_list)) {
+ while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
}
@@ -9683,7 +10353,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_tctx_node(unsigned long index)
+static __cold void io_uring_del_tctx_node(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -9706,7 +10376,7 @@ static void io_uring_del_tctx_node(unsigned long index)
kfree(node);
}
-static void io_uring_clean_tctx(struct io_uring_task *tctx)
+static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
struct io_wq *wq = tctx->io_wq;
struct io_tctx_node *node;
@@ -9718,7 +10388,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
}
if (wq) {
/*
- * Must be after io_uring_del_task_file() (removes nodes under
+ * Must be after io_uring_del_tctx_node() (removes nodes under
* uring_lock) to avoid race with io_uring_try_cancel_iowq().
*/
io_wq_put_and_exit(wq);
@@ -9729,27 +10399,16 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
if (tracked)
- return atomic_read(&tctx->inflight_tracked);
+ return 0;
return percpu_counter_sum(&tctx->inflight);
}
-static void io_uring_drop_tctx_refs(struct task_struct *task)
-{
- struct io_uring_task *tctx = task->io_uring;
- unsigned int refs = tctx->cached_refs;
-
- if (refs) {
- tctx->cached_refs = 0;
- percpu_counter_sub(&tctx->inflight, refs);
- put_task_struct_many(task, refs);
- }
-}
-
/*
* Find any io_uring ctx that this task has registered or done IO on, and cancel
- * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
+ * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
*/
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
+static __cold void io_uring_cancel_generic(bool cancel_all,
+ struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
@@ -9788,8 +10447,10 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
cancel_all);
}
- prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
+ io_run_task_work();
io_uring_drop_tctx_refs(current);
+
/*
* If we've seen completions, retry without waiting. This
* avoids a race where a completion comes in before we did
@@ -9799,10 +10460,14 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
schedule();
finish_wait(&tctx->wait, &wait);
} while (1);
- atomic_dec(&tctx->in_idle);
io_uring_clean_tctx(tctx);
if (cancel_all) {
+ /*
+ * We shouldn't run task_works after cancel, so just leave
+ * ->in_idle set for normal exit.
+ */
+ atomic_dec(&tctx->in_idle);
/* for exec all current's requests should be gone, kill tctx */
__io_uring_free(current);
}
@@ -9813,6 +10478,144 @@ void __io_uring_cancel(bool cancel_all)
io_uring_cancel_generic(cancel_all, NULL);
}
+void io_uring_unreg_ringfd(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ int i;
+
+ for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+ if (tctx->registered_rings[i]) {
+ fput(tctx->registered_rings[i]);
+ tctx->registered_rings[i] = NULL;
+ }
+ }
+}
+
+static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
+ int start, int end)
+{
+ struct file *file;
+ int offset;
+
+ for (offset = start; offset < end; offset++) {
+ offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[offset])
+ continue;
+
+ file = fget(fd);
+ if (!file) {
+ return -EBADF;
+ } else if (file->f_op != &io_uring_fops) {
+ fput(file);
+ return -EOPNOTSUPP;
+ }
+ tctx->registered_rings[offset] = file;
+ return offset;
+ }
+
+ return -EBUSY;
+}
+
+/*
+ * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
+ * invocation. User passes in an array of struct io_uring_rsrc_update
+ * with ->data set to the ring_fd, and ->offset given for the desired
+ * index. If no index is desired, application may set ->offset == -1U
+ * and we'll find an available index. Returns number of entries
+ * successfully processed, or < 0 on error if none were processed.
+ */
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_rsrc_update reg;
+ struct io_uring_task *tctx;
+ int ret, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+
+ mutex_unlock(&ctx->uring_lock);
+ ret = io_uring_add_tctx_node(ctx);
+ mutex_lock(&ctx->uring_lock);
+ if (ret)
+ return ret;
+
+ tctx = current->io_uring;
+ for (i = 0; i < nr_args; i++) {
+ int start, end;
+
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (reg.resv) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (reg.offset == -1U) {
+ start = 0;
+ end = IO_RINGFD_REG_MAX;
+ } else {
+ if (reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+ start = reg.offset;
+ end = start + 1;
+ }
+
+ ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
+ if (ret < 0)
+ break;
+
+ reg.offset = ret;
+ if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ return i ? i : ret;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_task *tctx = current->io_uring;
+ struct io_uring_rsrc_update reg;
+ int ret = 0, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ if (!tctx)
+ return 0;
+
+ for (i = 0; i < nr_args; i++) {
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[reg.offset]) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ }
+ }
+
+ return i ? i : ret;
+}
+
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
@@ -9842,7 +10645,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
#ifdef CONFIG_MMU
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
@@ -9925,6 +10728,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
+ if (arg.pad)
+ return -EINVAL;
*sig = u64_to_user_ptr(arg.sigmask);
*argsz = arg.sigmask_sz;
*ts = u64_to_user_ptr(arg.ts);
@@ -9943,12 +10748,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+ IORING_ENTER_REGISTERED_RING)))
return -EINVAL;
- f = fdget(fd);
- if (unlikely(!f.file))
- return -EBADF;
+ /*
+ * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
+ * need only dereference our task private array to find it.
+ */
+ if (flags & IORING_ENTER_REGISTERED_RING) {
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx || fd >= IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
+ f.file = tctx->registered_rings[fd];
+ if (unlikely(!f.file))
+ return -EBADF;
+ } else {
+ f = fdget(fd);
+ if (unlikely(!f.file))
+ return -EBADF;
+ }
ret = -EOPNOTSUPP;
if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10022,12 +10843,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
out:
percpu_ref_put(&ctx->refs);
out_fput:
- fdput(f);
+ if (!(flags & IORING_ENTER_REGISTERED_RING))
+ fdput(f);
return submitted ? submitted : ret;
}
#ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
const struct cred *cred)
{
struct user_namespace *uns = seq_user_ns(m);
@@ -10059,11 +10881,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0;
}
-static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+ struct seq_file *m)
{
struct io_sq_data *sq = NULL;
+ struct io_overflow_cqe *ocqe;
+ struct io_rings *r = ctx->rings;
+ unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+ unsigned int sq_head = READ_ONCE(r->sq.head);
+ unsigned int sq_tail = READ_ONCE(r->sq.tail);
+ unsigned int cq_head = READ_ONCE(r->cq.head);
+ unsigned int cq_tail = READ_ONCE(r->cq.tail);
+ unsigned int sq_entries, cq_entries;
bool has_lock;
- int i;
+ unsigned int i;
+
+ /*
+ * we may get imprecise sqe and cqe info if uring is actively running
+ * since we get cached_sq_head and cached_cq_tail without uring_lock
+ * and sq_tail and cq_head are changed by userspace. But it's ok since
+ * we usually use these info when it is stuck.
+ */
+ seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
+ seq_printf(m, "SqHead:\t%u\n", sq_head);
+ seq_printf(m, "SqTail:\t%u\n", sq_tail);
+ seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
+ seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
+ seq_printf(m, "CqHead:\t%u\n", cq_head);
+ seq_printf(m, "CqTail:\t%u\n", cq_tail);
+ seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
+ seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+ sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
+ for (i = 0; i < sq_entries; i++) {
+ unsigned int entry = i + sq_head;
+ unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+ struct io_uring_sqe *sqe;
+
+ if (sq_idx > sq_mask)
+ continue;
+ sqe = &ctx->sq_sqes[sq_idx];
+ seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
+ sq_idx, sqe->opcode, sqe->fd, sqe->flags,
+ sqe->user_data);
+ }
+ seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
+ cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
+ for (i = 0; i < cq_entries; i++) {
+ unsigned int entry = i + cq_head;
+ struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+ entry & cq_mask, cqe->user_data, cqe->res,
+ cqe->flags);
+ }
/*
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
@@ -10105,7 +10975,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
- seq_printf(m, "PollList:\n");
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
+
+ seq_puts(m, "PollList:\n");
spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
struct hlist_head *list = &ctx->cancel_hash[i];
@@ -10113,14 +10986,22 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
hlist_for_each_entry(req, list, hash_node)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
- req->task->task_works != NULL);
+ task_work_pending(req->task));
+ }
+
+ seq_puts(m, "CqOverflowList:\n");
+ list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
+ struct io_uring_cqe *cqe = &ocqe->cqe;
+
+ seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
+ cqe->user_data, cqe->res, cqe->flags);
+
}
+
spin_unlock(&ctx->completion_lock);
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
}
-static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
@@ -10144,8 +11025,8 @@ static const struct file_operations io_uring_fops = {
#endif
};
-static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
struct io_rings *rings;
size_t size, sq_array_offset;
@@ -10221,8 +11102,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return ERR_PTR(ret);
#endif
- file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
- O_RDWR | O_CLOEXEC);
+ file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ O_RDWR | O_CLOEXEC, NULL);
#if defined(CONFIG_UNIX)
if (IS_ERR(file)) {
sock_release(ctx->ring_sock);
@@ -10234,8 +11115,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return file;
}
-static int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
{
struct io_ring_ctx *ctx;
struct file *file;
@@ -10330,7 +11211,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
- IORING_FEAT_RSRC_TAGS;
+ IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
+ IORING_FEAT_LINKED_FILE;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -10381,7 +11263,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED))
+ IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -10393,7 +11275,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
@@ -10449,8 +11332,8 @@ static int io_register_personality(struct io_ring_ctx *ctx)
return id;
}
-static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args)
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned int nr_args)
{
struct io_uring_restriction *res;
size_t size;
@@ -10540,8 +11423,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
__u32 tmp;
int err;
- if (up->resv)
- return -EINVAL;
if (check_add_overflow(up->offset, nr_args, &tmp))
return -EOVERFLOW;
err = io_rsrc_node_switch_start(ctx);
@@ -10567,6 +11448,8 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
memset(&up, 0, sizeof(up));
if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
return -EFAULT;
+ if (up.resv || up.resv2)
+ return -EINVAL;
return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}
@@ -10579,12 +11462,12 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
- if (!up.nr || up.resv)
+ if (!up.nr || up.resv || up.resv2)
return -EINVAL;
return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
-static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type)
{
struct io_uring_rsrc_register rr;
@@ -10610,8 +11493,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
}
-static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
- unsigned len)
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned len)
{
struct io_uring_task *tctx = current->io_uring;
cpumask_var_t new_mask;
@@ -10627,7 +11510,15 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
if (len > cpumask_size())
len = cpumask_size();
- if (copy_from_user(new_mask, arg, len)) {
+ if (in_compat_syscall()) {
+ ret = compat_get_bitmap(cpumask_bits(new_mask),
+ (const compat_ulong_t __user *)arg,
+ len * 8 /* CHAR_BIT */);
+ } else {
+ ret = copy_from_user(new_mask, arg, len);
+ }
+
+ if (ret) {
free_cpumask_var(new_mask);
return -EFAULT;
}
@@ -10637,7 +11528,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
@@ -10647,8 +11538,8 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
return io_wq_cpu_affinity(tctx->io_wq, NULL);
}
-static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
__must_hold(&ctx->uring_lock)
{
struct io_tctx_node *node;
@@ -10684,10 +11575,11 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
- memcpy(ctx->iowq_limits, new_count, sizeof(new_count));
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i])
+ ctx->iowq_limits[i] = new_count[i];
ctx->iowq_limits_set = true;
- ret = -EINVAL;
if (tctx && tctx->io_wq) {
ret = io_wq_max_workers(tctx->io_wq, new_count);
if (ret)
@@ -10729,57 +11621,6 @@ err:
return ret;
}
-static bool io_register_op_must_quiesce(int op)
-{
- switch (op) {
- case IORING_REGISTER_BUFFERS:
- case IORING_UNREGISTER_BUFFERS:
- case IORING_REGISTER_FILES:
- case IORING_UNREGISTER_FILES:
- case IORING_REGISTER_FILES_UPDATE:
- case IORING_REGISTER_PROBE:
- case IORING_REGISTER_PERSONALITY:
- case IORING_UNREGISTER_PERSONALITY:
- case IORING_REGISTER_FILES2:
- case IORING_REGISTER_FILES_UPDATE2:
- case IORING_REGISTER_BUFFERS2:
- case IORING_REGISTER_BUFFERS_UPDATE:
- case IORING_REGISTER_IOWQ_AFF:
- case IORING_UNREGISTER_IOWQ_AFF:
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- return false;
- default:
- return true;
- }
-}
-
-static int io_ctx_quiesce(struct io_ring_ctx *ctx)
-{
- long ret;
-
- percpu_ref_kill(&ctx->refs);
-
- /*
- * Drop uring mutex before waiting for references to exit. If another
- * thread is currently inside io_uring_enter() it might need to grab the
- * uring_lock to make progress. If we hold it here across the drain
- * wait, then we can deadlock. It's safe to drop the mutex here, since
- * no new references will come in after we've killed the percpu ref.
- */
- mutex_unlock(&ctx->uring_lock);
- do {
- ret = wait_for_completion_interruptible(&ctx->ref_comp);
- if (!ret)
- break;
- ret = io_run_task_work_sig();
- } while (ret >= 0);
- mutex_lock(&ctx->uring_lock);
-
- if (ret)
- io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
- return ret;
-}
-
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -10803,12 +11644,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return -EACCES;
}
- if (io_register_op_must_quiesce(opcode)) {
- ret = io_ctx_quiesce(ctx);
- if (ret)
- return ret;
- }
-
switch (opcode) {
case IORING_REGISTER_BUFFERS:
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
@@ -10832,17 +11667,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_register_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
- case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
- ret = io_eventfd_register(ctx, arg);
- if (ret)
+ ret = io_eventfd_register(ctx, arg, 0);
+ break;
+ case IORING_REGISTER_EVENTFD_ASYNC:
+ ret = -EINVAL;
+ if (nr_args != 1)
break;
- if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
- ctx->eventfd_async = 1;
- else
- ctx->eventfd_async = 0;
+ ret = io_eventfd_register(ctx, arg, 1);
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
@@ -10909,16 +11743,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
+ case IORING_REGISTER_RING_FDS:
+ ret = io_ringfd_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_RING_FDS:
+ ret = io_ringfd_unregister(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
- if (io_register_op_must_quiesce(opcode)) {
- /* bring the ctx back to life */
- percpu_ref_reinit(&ctx->refs);
- reinit_completion(&ctx->ref_comp);
- }
return ret;
}
@@ -10944,8 +11779,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
- ctx->cq_ev_fd != NULL, ret);
+ trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
fdput(f);
return ret;
@@ -11002,6 +11836,8 @@ static int __init io_uring_init(void)
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+ BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
+ BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 504e69578112..80ac36aea913 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -173,7 +173,7 @@ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (*len == 0)
return -EINVAL;
- if (start > maxbytes)
+ if (start >= maxbytes)
return -EFBIG;
/*
@@ -236,9 +236,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
if (!src_file.file)
return -EBADF;
- ret = -EXDEV;
- if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
- goto fdput;
cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
olen, 0);
if (cloned < 0)
@@ -247,7 +244,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EINVAL;
else
ret = 0;
-fdput:
fdput(src_file);
return ret;
}
@@ -430,7 +426,7 @@ static int ioctl_file_dedupe_range(struct file *file,
goto out;
}
- size = offsetof(struct file_dedupe_range __user, info[count]);
+ size = offsetof(struct file_dedupe_range, info[count]);
if (size > PAGE_SIZE) {
ret = -ENOMEM;
goto out;
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 4143a3ff89db..fc070184b7fa 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -9,9 +9,9 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events
obj-$(CONFIG_FS_IOMAP) += iomap.o
iomap-y += trace.o \
- buffered-io.o \
+ iter.o
+iomap-$(CONFIG_BLOCK) += buffered-io.o \
direct-io.o \
fiemap.o \
- iter.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9cc5798423d1..8ce8720093b9 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -21,9 +21,11 @@
#include "../internal.h"
+#define IOEND_BATCH_SIZE 4096
+
/*
- * Structure allocated for each page or THP when block size < page size
- * to track sub-page uptodate status and I/O completions.
+ * Structure allocated for each folio when block size < folio size
+ * to track sub-folio uptodate status and I/O completions.
*/
struct iomap_page {
atomic_t read_bytes_pending;
@@ -32,27 +34,20 @@ struct iomap_page {
unsigned long uptodate[];
};
-static inline struct iomap_page *to_iomap_page(struct page *page)
+static inline struct iomap_page *to_iomap_page(struct folio *folio)
{
- /*
- * per-block data is stored in the head page. Callers should
- * not be dealing with tail pages, and if they are, they can
- * call thp_head() first.
- */
- VM_BUG_ON_PGFLAGS(PageTail(page), page);
-
- if (page_has_private(page))
- return (struct iomap_page *)page_private(page);
+ if (folio_test_private(folio))
+ return folio_get_private(folio);
return NULL;
}
static struct bio_set iomap_ioend_bioset;
static struct iomap_page *
-iomap_page_create(struct inode *inode, struct page *page)
+iomap_page_create(struct inode *inode, struct folio *folio)
{
- struct iomap_page *iop = to_iomap_page(page);
- unsigned int nr_blocks = i_blocks_per_page(inode, page);
+ struct iomap_page *iop = to_iomap_page(folio);
+ unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
if (iop || nr_blocks <= 1)
return iop;
@@ -60,40 +55,40 @@ iomap_page_create(struct inode *inode, struct page *page)
iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
GFP_NOFS | __GFP_NOFAIL);
spin_lock_init(&iop->uptodate_lock);
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
bitmap_fill(iop->uptodate, nr_blocks);
- attach_page_private(page, iop);
+ folio_attach_private(folio, iop);
return iop;
}
-static void
-iomap_page_release(struct page *page)
+static void iomap_page_release(struct folio *folio)
{
- struct iomap_page *iop = detach_page_private(page);
- unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
+ struct iomap_page *iop = folio_detach_private(folio);
+ struct inode *inode = folio->mapping->host;
+ unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
if (!iop)
return;
WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
- PageUptodate(page));
+ folio_test_uptodate(folio));
kfree(iop);
}
/*
- * Calculate the range inside the page that we actually need to read.
+ * Calculate the range inside the folio that we actually need to read.
*/
-static void
-iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
- loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
+static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
+ loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
{
+ struct iomap_page *iop = to_iomap_page(folio);
loff_t orig_pos = *pos;
loff_t isize = i_size_read(inode);
unsigned block_bits = inode->i_blkbits;
unsigned block_size = (1 << block_bits);
- unsigned poff = offset_in_page(*pos);
- unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+ size_t poff = offset_in_folio(folio, *pos);
+ size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
unsigned first = poff >> block_bits;
unsigned last = (poff + plen - 1) >> block_bits;
@@ -131,7 +126,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
* page cache for blocks that are entirely outside of i_size.
*/
if (orig_pos <= isize && orig_pos + length > isize) {
- unsigned end = offset_in_page(isize - 1) >> block_bits;
+ unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
if (first <= end && last > end)
plen -= (last - end) * block_size;
@@ -141,80 +136,87 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
*lenp = plen;
}
-static void
-iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+static void iomap_iop_set_range_uptodate(struct folio *folio,
+ struct iomap_page *iop, size_t off, size_t len)
{
- struct iomap_page *iop = to_iomap_page(page);
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned first = off >> inode->i_blkbits;
unsigned last = (off + len - 1) >> inode->i_blkbits;
unsigned long flags;
spin_lock_irqsave(&iop->uptodate_lock, flags);
bitmap_set(iop->uptodate, first, last - first + 1);
- if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page)))
- SetPageUptodate(page);
+ if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
+ folio_mark_uptodate(folio);
spin_unlock_irqrestore(&iop->uptodate_lock, flags);
}
-static void
-iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+static void iomap_set_range_uptodate(struct folio *folio,
+ struct iomap_page *iop, size_t off, size_t len)
{
- if (PageError(page))
+ if (folio_test_error(folio))
return;
- if (page_has_private(page))
- iomap_iop_set_range_uptodate(page, off, len);
+ if (iop)
+ iomap_iop_set_range_uptodate(folio, iop, off, len);
else
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
-static void
-iomap_read_page_end_io(struct bio_vec *bvec, int error)
+static void iomap_finish_folio_read(struct folio *folio, size_t offset,
+ size_t len, int error)
{
- struct page *page = bvec->bv_page;
- struct iomap_page *iop = to_iomap_page(page);
+ struct iomap_page *iop = to_iomap_page(folio);
if (unlikely(error)) {
- ClearPageUptodate(page);
- SetPageError(page);
+ folio_clear_uptodate(folio);
+ folio_set_error(folio);
} else {
- iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
+ iomap_set_range_uptodate(folio, iop, offset, len);
}
- if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending))
- unlock_page(page);
+ if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
+ folio_unlock(folio);
}
-static void
-iomap_read_end_io(struct bio *bio)
+static void iomap_read_end_io(struct bio *bio)
{
int error = blk_status_to_errno(bio->bi_status);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all)
- iomap_read_page_end_io(bvec, error);
+ bio_for_each_folio_all(fi, bio)
+ iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
bio_put(bio);
}
struct iomap_readpage_ctx {
- struct page *cur_page;
- bool cur_page_in_bio;
+ struct folio *cur_folio;
+ bool cur_folio_in_bio;
struct bio *bio;
struct readahead_control *rac;
};
-static loff_t iomap_read_inline_data(const struct iomap_iter *iter,
- struct page *page)
+/**
+ * iomap_read_inline_data - copy inline data into the page cache
+ * @iter: iteration structure
+ * @folio: folio to copy to
+ *
+ * Copy the inline data in @iter into @folio and zero out the rest of the folio.
+ * Only a single IOMAP_INLINE extent is allowed at the end of each file.
+ * Returns zero for success to complete the read, or the usual negative errno.
+ */
+static int iomap_read_inline_data(const struct iomap_iter *iter,
+ struct folio *folio)
{
+ struct iomap_page *iop;
const struct iomap *iomap = iomap_iter_srcmap(iter);
size_t size = i_size_read(iter->inode) - iomap->offset;
size_t poff = offset_in_page(iomap->offset);
+ size_t offset = offset_in_folio(folio, iomap->offset);
void *addr;
- if (PageUptodate(page))
- return PAGE_SIZE - poff;
+ if (folio_test_uptodate(folio))
+ return 0;
if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
return -EIO;
@@ -223,15 +225,17 @@ static loff_t iomap_read_inline_data(const struct iomap_iter *iter,
return -EIO;
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
- if (poff > 0)
- iomap_page_create(iter->inode, page);
+ if (offset > 0)
+ iop = iomap_page_create(iter->inode, folio);
+ else
+ iop = to_iomap_page(folio);
- addr = kmap_local_page(page) + poff;
+ addr = kmap_local_folio(folio, offset);
memcpy(addr, iomap->inline_data, size);
memset(addr + size, 0, PAGE_SIZE - poff - size);
kunmap_local(addr);
- iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff);
- return PAGE_SIZE - poff;
+ iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
+ return 0;
}
static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
@@ -250,36 +254,36 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
const struct iomap *iomap = &iter->iomap;
loff_t pos = iter->pos + offset;
loff_t length = iomap_length(iter) - offset;
- struct page *page = ctx->cur_page;
+ struct folio *folio = ctx->cur_folio;
struct iomap_page *iop;
loff_t orig_pos = pos;
- unsigned poff, plen;
+ size_t poff, plen;
sector_t sector;
if (iomap->type == IOMAP_INLINE)
- return min(iomap_read_inline_data(iter, page), length);
+ return iomap_read_inline_data(iter, folio);
/* zero post-eof blocks as the page may be mapped */
- iop = iomap_page_create(iter->inode, page);
- iomap_adjust_read_range(iter->inode, iop, &pos, length, &poff, &plen);
+ iop = iomap_page_create(iter->inode, folio);
+ iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
if (plen == 0)
goto done;
if (iomap_block_needs_zeroing(iter, pos)) {
- zero_user(page, poff, plen);
- iomap_set_range_uptodate(page, poff, plen);
+ folio_zero_range(folio, poff, plen);
+ iomap_set_range_uptodate(folio, iop, poff, plen);
goto done;
}
- ctx->cur_page_in_bio = true;
+ ctx->cur_folio_in_bio = true;
if (iop)
atomic_add(plen, &iop->read_bytes_pending);
sector = iomap_sector(iomap, pos);
if (!ctx->bio ||
bio_end_sector(ctx->bio) != sector ||
- bio_add_page(ctx->bio, page, plen, poff) != plen) {
- gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ !bio_add_folio(ctx->bio, folio, plen, poff)) {
+ gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
gfp_t orig_gfp = gfp;
unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
@@ -288,22 +292,24 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
if (ctx->rac) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
+ ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
+ REQ_OP_READ, gfp);
/*
* If the bio_alloc fails, try it again for a single page to
* avoid having to deal with partial page reads. This emulates
* what do_mpage_readpage does.
*/
- if (!ctx->bio)
- ctx->bio = bio_alloc(orig_gfp, 1);
- ctx->bio->bi_opf = REQ_OP_READ;
+ if (!ctx->bio) {
+ ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
+ orig_gfp);
+ }
if (ctx->rac)
ctx->bio->bi_opf |= REQ_RAHEAD;
ctx->bio->bi_iter.bi_sector = sector;
- bio_set_dev(ctx->bio, iomap->bdev);
ctx->bio->bi_end_io = iomap_read_end_io;
- __bio_add_page(ctx->bio, page, plen, poff);
+ bio_add_folio(ctx->bio, folio, plen, poff);
}
+
done:
/*
* Move the caller beyond our range so that it keeps making progress.
@@ -317,30 +323,31 @@ done:
int
iomap_readpage(struct page *page, const struct iomap_ops *ops)
{
+ struct folio *folio = page_folio(page);
struct iomap_iter iter = {
- .inode = page->mapping->host,
- .pos = page_offset(page),
- .len = PAGE_SIZE,
+ .inode = folio->mapping->host,
+ .pos = folio_pos(folio),
+ .len = folio_size(folio),
};
struct iomap_readpage_ctx ctx = {
- .cur_page = page,
+ .cur_folio = folio,
};
int ret;
- trace_iomap_readpage(page->mapping->host, 1);
+ trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
if (ret < 0)
- SetPageError(page);
+ folio_set_error(folio);
if (ctx.bio) {
submit_bio(ctx.bio);
- WARN_ON_ONCE(!ctx.cur_page_in_bio);
+ WARN_ON_ONCE(!ctx.cur_folio_in_bio);
} else {
- WARN_ON_ONCE(ctx.cur_page_in_bio);
- unlock_page(page);
+ WARN_ON_ONCE(ctx.cur_folio_in_bio);
+ folio_unlock(folio);
}
/*
@@ -359,17 +366,19 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
loff_t done, ret;
for (done = 0; done < length; done += ret) {
- if (ctx->cur_page && offset_in_page(iter->pos + done) == 0) {
- if (!ctx->cur_page_in_bio)
- unlock_page(ctx->cur_page);
- put_page(ctx->cur_page);
- ctx->cur_page = NULL;
+ if (ctx->cur_folio &&
+ offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
+ if (!ctx->cur_folio_in_bio)
+ folio_unlock(ctx->cur_folio);
+ ctx->cur_folio = NULL;
}
- if (!ctx->cur_page) {
- ctx->cur_page = readahead_page(ctx->rac);
- ctx->cur_page_in_bio = false;
+ if (!ctx->cur_folio) {
+ ctx->cur_folio = readahead_folio(ctx->rac);
+ ctx->cur_folio_in_bio = false;
}
ret = iomap_readpage_iter(iter, ctx, done);
+ if (ret <= 0)
+ return ret;
}
return done;
@@ -408,101 +417,105 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
if (ctx.bio)
submit_bio(ctx.bio);
- if (ctx.cur_page) {
- if (!ctx.cur_page_in_bio)
- unlock_page(ctx.cur_page);
- put_page(ctx.cur_page);
+ if (ctx.cur_folio) {
+ if (!ctx.cur_folio_in_bio)
+ folio_unlock(ctx.cur_folio);
}
}
EXPORT_SYMBOL_GPL(iomap_readahead);
/*
- * iomap_is_partially_uptodate checks whether blocks within a page are
+ * iomap_is_partially_uptodate checks whether blocks within a folio are
* uptodate or not.
*
- * Returns true if all blocks which correspond to a file portion
- * we want to read within the page are uptodate.
+ * Returns true if all blocks which correspond to the specified part
+ * of the folio are uptodate.
*/
-int
-iomap_is_partially_uptodate(struct page *page, unsigned long from,
- unsigned long count)
+bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
- struct iomap_page *iop = to_iomap_page(page);
- struct inode *inode = page->mapping->host;
- unsigned len, first, last;
- unsigned i;
+ struct iomap_page *iop = to_iomap_page(folio);
+ struct inode *inode = folio->mapping->host;
+ unsigned first, last, i;
- /* Limit range to one page */
- len = min_t(unsigned, PAGE_SIZE - from, count);
+ if (!iop)
+ return false;
- /* First and last blocks in range within page */
- first = from >> inode->i_blkbits;
- last = (from + len - 1) >> inode->i_blkbits;
+ /* Caller's range may extend past the end of this folio */
+ count = min(folio_size(folio) - from, count);
- if (iop) {
- for (i = first; i <= last; i++)
- if (!test_bit(i, iop->uptodate))
- return 0;
- return 1;
- }
+ /* First and last blocks in range within folio */
+ first = from >> inode->i_blkbits;
+ last = (from + count - 1) >> inode->i_blkbits;
- return 0;
+ for (i = first; i <= last; i++)
+ if (!test_bit(i, iop->uptodate))
+ return false;
+ return true;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
int
iomap_releasepage(struct page *page, gfp_t gfp_mask)
{
- trace_iomap_releasepage(page->mapping->host, page_offset(page),
- PAGE_SIZE);
+ struct folio *folio = page_folio(page);
+
+ trace_iomap_releasepage(folio->mapping->host, folio_pos(folio),
+ folio_size(folio));
/*
* mm accommodates an old ext3 case where clean pages might not have had
* the dirty bit cleared. Thus, it can send actual dirty pages to
* ->releasepage() via shrink_active_list(); skip those here.
*/
- if (PageDirty(page) || PageWriteback(page))
+ if (folio_test_dirty(folio) || folio_test_writeback(folio))
return 0;
- iomap_page_release(page);
+ iomap_page_release(folio);
return 1;
}
EXPORT_SYMBOL_GPL(iomap_releasepage);
-void
-iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
+void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
- trace_iomap_invalidatepage(page->mapping->host, offset, len);
+ trace_iomap_invalidate_folio(folio->mapping->host,
+ folio_pos(folio) + offset, len);
/*
- * If we're invalidating the entire page, clear the dirty state from it
- * and release it to avoid unnecessary buildup of the LRU.
+ * If we're invalidating the entire folio, clear the dirty state
+ * from it and release it to avoid unnecessary buildup of the LRU.
*/
- if (offset == 0 && len == PAGE_SIZE) {
- WARN_ON_ONCE(PageWriteback(page));
- cancel_dirty_page(page);
- iomap_page_release(page);
+ if (offset == 0 && len == folio_size(folio)) {
+ WARN_ON_ONCE(folio_test_writeback(folio));
+ folio_cancel_dirty(folio);
+ iomap_page_release(folio);
+ } else if (folio_test_large(folio)) {
+ /* Must release the iop so the page can be split */
+ WARN_ON_ONCE(!folio_test_uptodate(folio) &&
+ folio_test_dirty(folio));
+ iomap_page_release(folio);
}
}
-EXPORT_SYMBOL_GPL(iomap_invalidatepage);
+EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
#ifdef CONFIG_MIGRATION
int
iomap_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode)
{
+ struct folio *folio = page_folio(page);
+ struct folio *newfolio = page_folio(newpage);
int ret;
- ret = migrate_page_move_mapping(mapping, newpage, page, 0);
+ ret = folio_migrate_mapping(mapping, newfolio, folio, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (page_has_private(page))
- attach_page_private(newpage, detach_page_private(page));
+ if (folio_test_private(folio))
+ folio_attach_private(newfolio, folio_detach_private(folio));
if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
+ folio_migrate_copy(newfolio, folio);
else
- migrate_page_states(newpage, page);
+ folio_migrate_flags(newfolio, folio);
return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL_GPL(iomap_migrate_page);
@@ -521,37 +534,35 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
truncate_pagecache_range(inode, max(pos, i_size), pos + len);
}
-static int
-iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
- unsigned plen, const struct iomap *iomap)
+static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
+ size_t poff, size_t plen, const struct iomap *iomap)
{
struct bio_vec bvec;
struct bio bio;
- bio_init(&bio, &bvec, 1);
- bio.bi_opf = REQ_OP_READ;
+ bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
- bio_set_dev(&bio, iomap->bdev);
- __bio_add_page(&bio, page, plen, poff);
+ bio_add_folio(&bio, folio, plen, poff);
return submit_bio_wait(&bio);
}
static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
- unsigned len, struct page *page)
+ size_t len, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct iomap_page *iop = iomap_page_create(iter->inode, page);
+ struct iomap_page *iop = iomap_page_create(iter->inode, folio);
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
- unsigned from = offset_in_page(pos), to = from + len, poff, plen;
+ size_t from = offset_in_folio(folio, pos), to = from + len;
+ size_t poff, plen;
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
return 0;
- ClearPageError(page);
+ folio_clear_error(folio);
do {
- iomap_adjust_read_range(iter->inode, iop, &block_start,
+ iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen);
if (plen == 0)
break;
@@ -564,39 +575,35 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (iomap_block_needs_zeroing(iter, block_start)) {
if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
return -EIO;
- zero_user_segments(page, poff, from, to, poff + plen);
+ folio_zero_segments(folio, poff, from, to, poff + plen);
} else {
- int status = iomap_read_page_sync(block_start, page,
+ int status = iomap_read_folio_sync(block_start, folio,
poff, plen, srcmap);
if (status)
return status;
}
- iomap_set_range_uptodate(page, poff, plen);
+ iomap_set_range_uptodate(folio, iop, poff, plen);
} while ((block_start += plen) < block_end);
return 0;
}
static int iomap_write_begin_inline(const struct iomap_iter *iter,
- struct page *page)
+ struct folio *folio)
{
- int ret;
-
/* needs more work for the tailpacking case; disable for now */
if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
return -EIO;
- ret = iomap_read_inline_data(iter, page);
- if (ret < 0)
- return ret;
- return 0;
+ return iomap_read_inline_data(iter, folio);
}
static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
- unsigned len, struct page **pagep)
+ size_t len, struct folio **foliop)
{
const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct page *page;
+ struct folio *folio;
+ unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
int status = 0;
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
@@ -606,35 +613,40 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (fatal_signal_pending(current))
return -EINTR;
+ if (!mapping_large_folio_support(iter->inode->i_mapping))
+ len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
+
if (page_ops && page_ops->page_prepare) {
status = page_ops->page_prepare(iter->inode, pos, len);
if (status)
return status;
}
- page = grab_cache_page_write_begin(iter->inode->i_mapping,
- pos >> PAGE_SHIFT, AOP_FLAG_NOFS);
- if (!page) {
+ folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+ fgp, mapping_gfp_mask(iter->inode->i_mapping));
+ if (!folio) {
status = -ENOMEM;
goto out_no_page;
}
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
if (srcmap->type == IOMAP_INLINE)
- status = iomap_write_begin_inline(iter, page);
+ status = iomap_write_begin_inline(iter, folio);
else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
- status = __block_write_begin_int(page, pos, len, NULL, srcmap);
+ status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
else
- status = __iomap_write_begin(iter, pos, len, page);
+ status = __iomap_write_begin(iter, pos, len, folio);
if (unlikely(status))
goto out_unlock;
- *pagep = page;
+ *foliop = folio;
return 0;
out_unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
iomap_write_failed(iter->inode, pos, len);
out_no_page:
@@ -644,9 +656,10 @@ out_no_page:
}
static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
- size_t copied, struct page *page)
+ size_t copied, struct folio *folio)
{
- flush_dcache_page(page);
+ struct iomap_page *iop = to_iomap_page(folio);
+ flush_dcache_folio(folio);
/*
* The blocks that were entirely written will now be uptodate, so we
@@ -659,24 +672,24 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
* non-uptodate page as a zero-length write, and force the caller to
* redo the whole thing.
*/
- if (unlikely(copied < len && !PageUptodate(page)))
+ if (unlikely(copied < len && !folio_test_uptodate(folio)))
return 0;
- iomap_set_range_uptodate(page, offset_in_page(pos), len);
- __set_page_dirty_nobuffers(page);
+ iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
+ filemap_dirty_folio(inode->i_mapping, folio);
return copied;
}
static size_t iomap_write_end_inline(const struct iomap_iter *iter,
- struct page *page, loff_t pos, size_t copied)
+ struct folio *folio, loff_t pos, size_t copied)
{
const struct iomap *iomap = &iter->iomap;
void *addr;
- WARN_ON_ONCE(!PageUptodate(page));
+ WARN_ON_ONCE(!folio_test_uptodate(folio));
BUG_ON(!iomap_inline_data_valid(iomap));
- flush_dcache_page(page);
- addr = kmap_local_page(page) + pos;
+ flush_dcache_folio(folio);
+ addr = kmap_local_folio(folio, pos);
memcpy(iomap_inline_data(iomap, pos), addr, copied);
kunmap_local(addr);
@@ -686,7 +699,7 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter,
/* Returns the number of bytes copied. May be 0. Cannot be an errno. */
static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
- size_t copied, struct page *page)
+ size_t copied, struct folio *folio)
{
const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -694,12 +707,12 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t ret;
if (srcmap->type == IOMAP_INLINE) {
- ret = iomap_write_end_inline(iter, page, pos, copied);
+ ret = iomap_write_end_inline(iter, folio, pos, copied);
} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
- copied, page, NULL);
+ copied, &folio->page, NULL);
} else {
- ret = __iomap_write_end(iter->inode, pos, len, copied, page);
+ ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
}
/*
@@ -711,13 +724,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
i_size_write(iter->inode, pos + ret);
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
- unlock_page(page);
+ folio_unlock(folio);
if (old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
if (page_ops && page_ops->page_done)
- page_ops->page_done(iter->inode, pos, ret, page);
- put_page(page);
+ page_ops->page_done(iter->inode, pos, ret, &folio->page);
+ folio_put(folio);
if (ret < len)
iomap_write_failed(iter->inode, pos, len);
@@ -732,6 +745,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
long status = 0;
do {
+ struct folio *folio;
struct page *page;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
@@ -750,21 +764,22 @@ again:
* same page as we're writing to, without it being marked
* up-to-date.
*/
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
break;
}
- status = iomap_write_begin(iter, pos, bytes, &page);
+ status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
break;
+ page = folio_file_page(folio, pos >> PAGE_SHIFT);
if (mapping_writably_mapped(iter->inode->i_mapping))
flush_dcache_page(page);
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
- status = iomap_write_end(iter, pos, bytes, copied, page);
+ status = iomap_write_end(iter, pos, bytes, copied, folio);
if (unlikely(copied != status))
iov_iter_revert(i, copied - status);
@@ -830,13 +845,13 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
do {
unsigned long offset = offset_in_page(pos);
unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
- struct page *page;
+ struct folio *folio;
- status = iomap_write_begin(iter, pos, bytes, &page);
+ status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
- status = iomap_write_end(iter, pos, bytes, bytes, page);
+ status = iomap_write_end(iter, pos, bytes, bytes, folio);
if (WARN_ON_ONCE(status == 0))
return -EIO;
@@ -870,26 +885,8 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
-static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length)
-{
- struct page *page;
- int status;
- unsigned offset = offset_in_page(pos);
- unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
-
- status = iomap_write_begin(iter, pos, bytes, &page);
- if (status)
- return status;
-
- zero_user(page, offset, bytes);
- mark_page_accessed(page);
-
- return iomap_write_end(iter, pos, bytes, bytes, page);
-}
-
static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
- struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
@@ -900,14 +897,25 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
return length;
do {
- s64 bytes;
+ struct folio *folio;
+ int status;
+ size_t offset;
+ size_t bytes = min_t(u64, SIZE_MAX, length);
- if (IS_DAX(iter->inode))
- bytes = dax_iomap_zero(pos, length, iomap);
- else
- bytes = __iomap_zero_iter(iter, pos, length);
- if (bytes < 0)
- return bytes;
+ status = iomap_write_begin(iter, pos, bytes, &folio);
+ if (status)
+ return status;
+
+ offset = offset_in_folio(folio, pos);
+ if (bytes > folio_size(folio) - offset)
+ bytes = folio_size(folio) - offset;
+
+ folio_zero_range(folio, offset, bytes);
+ folio_mark_accessed(folio);
+
+ bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+ if (WARN_ON_ONCE(bytes == 0))
+ return -EIO;
pos += bytes;
length -= bytes;
@@ -951,21 +959,21 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
-static loff_t iomap_page_mkwrite_iter(struct iomap_iter *iter,
- struct page *page)
+static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
+ struct folio *folio)
{
loff_t length = iomap_length(iter);
int ret;
if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
- ret = __block_write_begin_int(page, iter->pos, length, NULL,
+ ret = __block_write_begin_int(folio, iter->pos, length, NULL,
&iter->iomap);
if (ret)
return ret;
- block_commit_write(page, 0, length);
+ block_commit_write(&folio->page, 0, length);
} else {
- WARN_ON_ONCE(!PageUptodate(page));
- set_page_dirty(page);
+ WARN_ON_ONCE(!folio_test_uptodate(folio));
+ folio_mark_dirty(folio);
}
return length;
@@ -977,44 +985,43 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
.inode = file_inode(vmf->vma->vm_file),
.flags = IOMAP_WRITE | IOMAP_FAULT,
};
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
ssize_t ret;
- lock_page(page);
- ret = page_mkwrite_check_truncate(page, iter.inode);
+ folio_lock(folio);
+ ret = folio_mkwrite_check_truncate(folio, iter.inode);
if (ret < 0)
goto out_unlock;
- iter.pos = page_offset(page);
+ iter.pos = folio_pos(folio);
iter.len = ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_page_mkwrite_iter(&iter, page);
+ iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
if (ret < 0)
goto out_unlock;
- wait_for_stable_page(page);
+ folio_wait_stable(folio);
return VM_FAULT_LOCKED;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-static void
-iomap_finish_page_writeback(struct inode *inode, struct page *page,
- int error, unsigned int len)
+static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
+ size_t len, int error)
{
- struct iomap_page *iop = to_iomap_page(page);
+ struct iomap_page *iop = to_iomap_page(folio);
if (error) {
- SetPageError(page);
+ folio_set_error(folio);
mapping_set_error(inode->i_mapping, error);
}
- WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop);
+ WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop);
WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
- end_page_writeback(page);
+ folio_end_writeback(folio);
}
/*
@@ -1022,7 +1029,7 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page,
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
-static void
+static u32
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
struct inode *inode = ioend->io_inode;
@@ -1031,10 +1038,10 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
u64 start = bio->bi_iter.bi_sector;
loff_t offset = ioend->io_offset;
bool quiet = bio_flagged(bio, BIO_QUIET);
+ u32 folio_count = 0;
for (bio = &ioend->io_inline_bio; bio; bio = next) {
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
/*
* For the last bio, bi_private points to the ioend, so we
@@ -1045,10 +1052,12 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
else
next = bio->bi_private;
- /* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bv, bio, iter_all)
- iomap_finish_page_writeback(inode, bv->bv_page, error,
- bv->bv_len);
+ /* walk all folios in bio, ending page IO on them */
+ bio_for_each_folio_all(fi, bio) {
+ iomap_finish_folio_write(inode, fi.folio, fi.length,
+ error);
+ folio_count++;
+ }
bio_put(bio);
}
/* The ioend has been freed by bio_put() */
@@ -1058,20 +1067,36 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
"%s: writeback error on inode %lu, offset %lld, sector %llu",
inode->i_sb->s_id, inode->i_ino, offset, start);
}
+ return folio_count;
}
+/*
+ * Ioend completion routine for merged bios. This can only be called from task
+ * contexts as merged ioends can be of unbounded length. Hence we have to break up
+ * the writeback completions into manageable chunks to avoid long scheduler
+ * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
+ * good batch processing throughput without creating adverse scheduler latency
+ * conditions.
+ */
void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
struct list_head tmp;
+ u32 completions;
+
+ might_sleep();
list_replace_init(&ioend->io_list, &tmp);
- iomap_finish_ioend(ioend, error);
+ completions = iomap_finish_ioend(ioend, error);
while (!list_empty(&tmp)) {
+ if (completions > IOEND_BATCH_SIZE * 8) {
+ cond_resched();
+ completions = 0;
+ }
ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
list_del_init(&ioend->io_list);
- iomap_finish_ioend(ioend, error);
+ completions += iomap_finish_ioend(ioend, error);
}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);
@@ -1092,6 +1117,18 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
return false;
if (ioend->io_offset + ioend->io_size != next->io_offset)
return false;
+ /*
+ * Do not merge physically discontiguous ioends. The filesystem
+ * completion functions will have to iterate the physical
+ * discontiguities even if we merge the ioends at a logical level, so
+ * we don't gain anything by merging physical discontiguities here.
+ *
+ * We cannot use bio->bi_iter.bi_sector here as it is modified during
+ * submission so does not point to the start sector of the bio at
+ * completion.
+ */
+ if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
+ return false;
return true;
}
@@ -1180,11 +1217,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
struct iomap_ioend *ioend;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset);
- bio_set_dev(bio, wpc->iomap.bdev);
+ bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wbc),
+ GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- bio->bi_write_hint = inode->i_write_hint;
wbc_init_bio(wbc, bio);
ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
@@ -1193,8 +1229,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
ioend->io_flags = wpc->iomap.flags;
ioend->io_inode = inode;
ioend->io_size = 0;
+ ioend->io_folios = 0;
ioend->io_offset = offset;
ioend->io_bio = bio;
+ ioend->io_sector = sector;
return ioend;
}
@@ -1210,11 +1248,9 @@ iomap_chain_bio(struct bio *prev)
{
struct bio *new;
- new = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
- bio_copy_dev(new, prev);/* also copies over blkcg information */
+ new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
+ bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
- new->bi_write_hint = prev->bi_write_hint;
bio_chain(prev, new);
bio_get(prev); /* for iomap_finish_ioend */
@@ -1235,6 +1271,13 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
return false;
if (sector != bio_end_sector(wpc->ioend->io_bio))
return false;
+ /*
+ * Limit ioend bio chain lengths to minimise IO completion latency. This
+ * also prevents long tight loops ending page writeback on all the
+ * folios in the ioend.
+ */
+ if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
+ return false;
return true;
}
@@ -1243,29 +1286,29 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
* first; otherwise finish off the current ioend and start another.
*/
static void
-iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
+iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct list_head *iolist)
{
- sector_t sector = iomap_sector(&wpc->iomap, offset);
+ sector_t sector = iomap_sector(&wpc->iomap, pos);
unsigned len = i_blocksize(inode);
- unsigned poff = offset & (PAGE_SIZE - 1);
+ size_t poff = offset_in_folio(folio, pos);
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
+ wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
}
- if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) {
+ if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
- __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ bio_add_folio(wpc->ioend->io_bio, folio, len, poff);
}
if (iop)
atomic_add(len, &iop->write_bytes_pending);
wpc->ioend->io_size += len;
- wbc_account_cgroup_owner(wbc, page, len);
+ wbc_account_cgroup_owner(wbc, &folio->page, len);
}
/*
@@ -1287,44 +1330,45 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
static int
iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode,
- struct page *page, u64 end_offset)
+ struct folio *folio, u64 end_pos)
{
- struct iomap_page *iop = iomap_page_create(inode, page);
+ struct iomap_page *iop = iomap_page_create(inode, folio);
struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode);
- u64 file_offset; /* file offset of page */
+ unsigned nblocks = i_blocks_per_folio(inode, folio);
+ u64 pos = folio_pos(folio);
int error = 0, count = 0, i;
LIST_HEAD(submit_list);
WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
/*
- * Walk through the page to find areas to write back. If we run off the
- * end of the current map or find the current map invalid, grab a new
- * one.
+ * Walk through the folio to find areas to write back. If we
+ * run off the end of the current map or find the current map
+ * invalid, grab a new one.
*/
- for (i = 0, file_offset = page_offset(page);
- i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
- i++, file_offset += len) {
+ for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
if (iop && !test_bit(i, iop->uptodate))
continue;
- error = wpc->ops->map_blocks(wpc, inode, file_offset);
+ error = wpc->ops->map_blocks(wpc, inode, pos);
if (error)
break;
if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
continue;
if (wpc->iomap.type == IOMAP_HOLE)
continue;
- iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
+ iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc,
&submit_list);
count++;
}
+ if (count)
+ wpc->ioend->io_folios++;
WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
- WARN_ON_ONCE(!PageLocked(page));
- WARN_ON_ONCE(PageWriteback(page));
- WARN_ON_ONCE(PageDirty(page));
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ WARN_ON_ONCE(folio_test_writeback(folio));
+ WARN_ON_ONCE(folio_test_dirty(folio));
/*
* We cannot cancel the ioend directly here on error. We may have
@@ -1339,17 +1383,17 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* won't be affected by I/O completion and we must unlock it
* now.
*/
- if (wpc->ops->discard_page)
- wpc->ops->discard_page(page, file_offset);
+ if (wpc->ops->discard_folio)
+ wpc->ops->discard_folio(folio, pos);
if (!count) {
- ClearPageUptodate(page);
- unlock_page(page);
+ folio_clear_uptodate(folio);
+ folio_unlock(folio);
goto done;
}
}
- set_page_writeback(page);
- unlock_page(page);
+ folio_start_writeback(folio);
+ folio_unlock(folio);
/*
* Preserve the original error if there was one; catch
@@ -1370,9 +1414,9 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* with a partial page truncate on a sub-page block sized filesystem.
*/
if (!count)
- end_page_writeback(page);
+ folio_end_writeback(folio);
done:
- mapping_set_error(page->mapping, error);
+ mapping_set_error(folio->mapping, error);
return error;
}
@@ -1386,16 +1430,15 @@ done:
static int
iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
{
+ struct folio *folio = page_folio(page);
struct iomap_writepage_ctx *wpc = data;
- struct inode *inode = page->mapping->host;
- pgoff_t end_index;
- u64 end_offset;
- loff_t offset;
+ struct inode *inode = folio->mapping->host;
+ u64 end_pos, isize;
- trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE);
+ trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
/*
- * Refuse to write the page out if we're called from reclaim context.
+ * Refuse to write the folio out if we're called from reclaim context.
*
* This avoids stack overflows when called from deeply used stacks in
* random callers for direct reclaim or memcg reclaim. We explicitly
@@ -1409,10 +1452,10 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
goto redirty;
/*
- * Is this page beyond the end of the file?
+ * Is this folio beyond the end of the file?
*
- * The page index is less than the end_index, adjust the end_offset
- * to the highest offset that this page should represent.
+ * The folio index is less than the end_index, adjust the end_pos
+ * to the highest offset that this folio should represent.
* -----------------------------------------------------
* | file mapping | <EOF> |
* -----------------------------------------------------
@@ -1421,11 +1464,9 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
* | desired writeback range | see else |
* ---------------------------------^------------------|
*/
- offset = i_size_read(inode);
- end_index = offset >> PAGE_SHIFT;
- if (page->index < end_index)
- end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT;
- else {
+ isize = i_size_read(inode);
+ end_pos = folio_pos(folio) + folio_size(folio);
+ if (end_pos > isize) {
/*
* Check whether the page to write out is beyond or straddles
* i_size or not.
@@ -1437,7 +1478,8 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
* | | Straddles |
* ---------------------------------^-----------|--------|
*/
- unsigned offset_into_page = offset & (PAGE_SIZE - 1);
+ size_t poff = offset_in_folio(folio, isize);
+ pgoff_t end_index = isize >> PAGE_SHIFT;
/*
* Skip the page if it's fully outside i_size, e.g. due to a
@@ -1456,8 +1498,8 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
* checking if the page is totally beyond i_size or if its
* offset is just equal to the EOF.
*/
- if (page->index > end_index ||
- (page->index == end_index && offset_into_page == 0))
+ if (folio->index > end_index ||
+ (folio->index == end_index && poff == 0))
goto redirty;
/*
@@ -1468,17 +1510,15 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
* memory is zeroed when mapped, and writes to that region are
* not written out to the file."
*/
- zero_user_segment(page, offset_into_page, PAGE_SIZE);
-
- /* Adjust the end_offset to the end of file */
- end_offset = offset;
+ folio_zero_segment(folio, poff, folio_size(folio));
+ end_pos = isize;
}
- return iomap_writepage_map(wpc, wbc, inode, page, end_offset);
+ return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
redirty:
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
return 0;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4ecd255e0511..b08f5dc31780 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -6,6 +6,8 @@
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
+#include <linux/fscrypt.h>
+#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
@@ -31,6 +33,7 @@ struct iomap_dio {
atomic_t ref;
unsigned flags;
int error;
+ size_t done_before;
bool wait_for_completion;
union {
@@ -38,8 +41,7 @@ struct iomap_dio {
struct {
struct iov_iter *iter;
struct task_struct *waiter;
- struct request_queue *last_queue;
- blk_qc_t cookie;
+ struct bio *poll_bio;
} submit;
/* used for aio completion: */
@@ -49,29 +51,20 @@ struct iomap_dio {
};
};
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
-{
- struct request_queue *q = READ_ONCE(kiocb->private);
-
- if (!q)
- return 0;
- return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
-}
-EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
atomic_inc(&dio->ref);
- if (dio->iocb->ki_flags & IOCB_HIPRI)
+ if (dio->iocb->ki_flags & IOCB_HIPRI) {
bio_set_polled(bio, dio->iocb);
+ dio->submit.poll_bio = bio;
+ }
- dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev);
if (dio->dops && dio->dops->submit_io)
- dio->submit.cookie = dio->dops->submit_io(iter, bio, pos);
+ dio->dops->submit_io(iter, bio, pos);
else
- dio->submit.cookie = submit_bio(bio);
+ submit_bio(bio);
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -124,6 +117,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
ret = generic_write_sync(iocb, ret);
+ if (ret > 0)
+ ret += dio->done_before;
+
kfree(dio);
return ret;
@@ -135,7 +131,7 @@ static void iomap_dio_complete_work(struct work_struct *work)
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
struct kiocb *iocb = dio->iocb;
- iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+ iocb->ki_complete(iocb, iomap_dio_complete(dio));
}
/*
@@ -164,9 +160,11 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else if (dio->flags & IOMAP_DIO_WRITE) {
struct inode *inode = file_inode(dio->iocb->ki_filp);
+ WRITE_ONCE(dio->iocb->private, NULL);
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
} else {
+ WRITE_ONCE(dio->iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
}
}
@@ -182,19 +180,20 @@ static void iomap_dio_bio_end_io(struct bio *bio)
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
+ struct inode *inode = file_inode(dio->iocb->ki_filp);
struct page *page = ZERO_PAGE(0);
int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
- bio = bio_alloc(GFP_KERNEL, 1);
- bio_set_dev(bio, iter->iomap.bdev);
+ bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
get_page(page);
__bio_add_page(bio, page, len, 0);
- bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
iomap_dio_submit_bio(iter, dio, bio, pos);
}
@@ -282,6 +281,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
if (!iov_iter_count(dio->submit.iter))
goto out;
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
+
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
@@ -305,14 +311,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio = bio_alloc(GFP_KERNEL, nr_pages);
- bio_set_dev(bio, iomap->bdev);
+ bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
- bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- bio->bi_opf = bio_opf;
ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
if (unlikely(ret)) {
@@ -339,6 +344,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
BIO_MAX_VECS);
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (nr_pages)
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
iomap_dio_submit_bio(iter, dio, bio, pos);
pos += n;
} while (nr_pages);
@@ -371,6 +381,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
dio->size += length;
+ if (!length)
+ return -EFAULT;
return length;
}
@@ -402,6 +414,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
copied = copy_to_iter(inline_data, length, iter);
}
dio->size += copied;
+ if (!copied)
+ return -EFAULT;
return copied;
}
@@ -446,13 +460,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
* may be pure data writes. In that case, we still need to do a full data sync
* completion.
*
+ * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
+ * __iomap_dio_rw can return a partial result if it encounters a non-resident
+ * page in @iter after preparing a transfer. In that case, the non-resident
+ * pages can be faulted in and the request resumed with @done_before set to the
+ * number of bytes previously transferred. The request will then complete with
+ * the correct total number of bytes transferred; this is essential for
+ * completing partial requests asynchronously.
+ *
* Returns -ENOTBLK in case of a page invalidation failure for
* writes. The caller needs to fall back to buffered I/O in this case.
*/
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags)
+ unsigned int dio_flags, size_t done_before)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
@@ -482,11 +504,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->dops = dops;
dio->error = 0;
dio->flags = 0;
+ dio->done_before = done_before;
dio->submit.iter = iter;
dio->submit.waiter = current;
- dio->submit.cookie = BLK_QC_T_NONE;
- dio->submit.last_queue = NULL;
+ dio->submit.poll_bio = NULL;
if (iov_iter_rw(iter) == READ) {
if (iomi.pos >= dio->i_size)
@@ -565,8 +587,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
inode_dio_begin(inode);
blk_start_plug(&plug);
- while ((ret = iomap_iter(&iomi, ops)) > 0)
+ while ((ret = iomap_iter(&iomi, ops)) > 0) {
iomi.processed = iomap_dio_iter(&iomi, dio);
+
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ iocb->ki_flags &= ~IOCB_HIPRI;
+ }
+
blk_finish_plug(&plug);
/*
@@ -577,6 +606,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
iov_iter_revert(iter, iomi.pos - dio->i_size);
+ if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
+ if (!(iocb->ki_flags & IOCB_NOWAIT))
+ wait_for_completion = true;
+ ret = 0;
+ }
+
/* magic error code to fall back to buffered I/O */
if (ret == -ENOTBLK) {
wait_for_completion = true;
@@ -592,8 +627,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
- WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
- WRITE_ONCE(iocb->private, dio->submit.last_queue);
+ WRITE_ONCE(iocb->private, dio->submit.poll_bio);
/*
* We are about to drop our additional submission reference, which
@@ -620,10 +654,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(dio->submit.waiter))
break;
- if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !dio->submit.last_queue ||
- !blk_poll(dio->submit.last_queue,
- dio->submit.cookie, true))
+ if (!dio->submit.poll_bio ||
+ !bio_poll(dio->submit.poll_bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -642,11 +674,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags)
+ unsigned int dio_flags, size_t done_before)
{
struct iomap_dio *dio;
- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
+ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 66cf267c68ae..610ca6f1ec9b 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/fiemap.h>
+#include <linux/pagemap.h>
static int iomap_to_fiemap(struct fiemap_extent_info *fi,
const struct iomap *iomap, u32 flags)
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 65e39785c284..a6689a563c6e 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -81,7 +81,7 @@ DEFINE_EVENT(iomap_range_class, name, \
TP_ARGS(inode, off, len))
DEFINE_RANGE_EVENT(iomap_writepage);
DEFINE_RANGE_EVENT(iomap_releasepage);
-DEFINE_RANGE_EVENT(iomap_invalidatepage);
+DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
#define IOMAP_TYPE_STRINGS \
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 678e2c51b855..d7491692aea3 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -70,7 +70,7 @@ static struct kmem_cache *isofs_inode_cachep;
static struct inode *isofs_alloc_inode(struct super_block *sb)
{
struct iso_inode_info *ei;
- ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, isofs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -1322,6 +1322,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
de = (struct iso_directory_record *) (bh->b_data + offset);
de_len = *(unsigned char *) de;
+ if (de_len < sizeof(struct iso_directory_record))
+ goto fail;
if (offset + de_len > bufsize) {
int frag1 = bufsize - offset;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3cc4ab2ba7f4..ac7f067b7bdd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -484,24 +484,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
stats.run.rs_locked);
- spin_lock(&commit_transaction->t_handle_lock);
- while (atomic_read(&commit_transaction->t_updates)) {
- DEFINE_WAIT(wait);
+ // waits for any t_updates to finish
+ jbd2_journal_wait_updates(journal);
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&commit_transaction->t_updates)) {
- spin_unlock(&commit_transaction->t_handle_lock);
- write_unlock(&journal->j_state_lock);
- schedule();
- write_lock(&journal->j_state_lock);
- spin_lock(&commit_transaction->t_handle_lock);
- }
- finish_wait(&journal->j_wait_updates, &wait);
- }
- spin_unlock(&commit_transaction->t_handle_lock);
commit_transaction->t_state = T_SWITCH;
- write_unlock(&journal->j_state_lock);
J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
journal->j_max_transaction_buffers);
@@ -521,6 +507,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* has reserved. This is consistent with the existing behaviour
* that multiple jbd2_journal_get_write_access() calls to the same
* buffer are perfectly permissible.
+ * We use journal->j_state_lock here to serialize processing of
+ * t_reserved_list with eviction of buffers from journal_unmap_buffer().
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
@@ -540,6 +528,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_refile_buffer(journal, jh);
}
+ write_unlock(&journal->j_state_lock);
/*
* Now try to drop any written-back buffers from the journal's
* checkpoint lists. We do this *before* commit because it potentially
@@ -817,7 +806,7 @@ start_journal_io:
commit_transaction->t_state = T_COMMIT_DFLUSH;
write_unlock(&journal->j_state_lock);
- /*
+ /*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
* the commit record
@@ -1170,7 +1159,7 @@ restart_loop:
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
if (journal->j_fc_cleanup_callback)
- journal->j_fc_cleanup_callback(journal, 1);
+ journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 35302bc192eb..fcacafa4510d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -86,7 +86,7 @@ EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
-EXPORT_SYMBOL(jbd2_journal_invalidatepage);
+EXPORT_SYMBOL(jbd2_journal_invalidate_folio);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
@@ -757,6 +757,7 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
}
journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
write_unlock(&journal->j_state_lock);
+ jbd2_journal_lock_updates(journal);
return 0;
}
@@ -768,8 +769,9 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit);
*/
static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
+ jbd2_journal_unlock_updates(journal);
if (journal->j_fc_cleanup_callback)
- journal->j_fc_cleanup_callback(journal, 0);
+ journal->j_fc_cleanup_callback(journal, 0, tid);
write_lock(&journal->j_state_lock);
journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
if (fallback)
@@ -1210,7 +1212,7 @@ static const struct seq_operations jbd2_seq_info_ops = {
static int jbd2_seq_info_open(struct inode *inode, struct file *file)
{
- journal_t *journal = PDE_DATA(inode);
+ journal_t *journal = pde_data(inode);
struct jbd2_stats_proc_session *s;
int rc, size;
@@ -1285,6 +1287,8 @@ static int jbd2_min_tag_size(void)
/**
* jbd2_journal_shrink_scan()
+ * @shrink: shrinker to work on
+ * @sc: reclaim request to process
*
* Scan the checkpointed buffer on the checkpoint list and release the
* journal_head.
@@ -1310,6 +1314,8 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
/**
* jbd2_journal_shrink_count()
+ * @shrink: shrinker to work on
+ * @sc: reclaim request to process
*
* Count the number of checkpoint buffers on the checkpoint list.
*/
@@ -2970,6 +2976,7 @@ struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
jbd_unlock_bh_journal_head(bh);
return jh;
}
+EXPORT_SYMBOL(jbd2_journal_grab_journal_head);
static void __journal_remove_journal_head(struct buffer_head *bh)
{
@@ -3022,6 +3029,7 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
jbd_unlock_bh_journal_head(bh);
}
}
+EXPORT_SYMBOL(jbd2_journal_put_journal_head);
/*
* Initialize jbd inode head
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6a3caedd2285..fcb9175016a5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -107,7 +107,6 @@ static void jbd2_get_transaction(journal_t *journal,
transaction->t_start_time = ktime_get();
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
- spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
jbd2_descriptor_blocks_per_trans(journal) +
@@ -139,26 +138,22 @@ static void jbd2_get_transaction(journal_t *journal,
/*
* Update transaction's maximum wait time, if debugging is enabled.
*
- * In order for t_max_wait to be reliable, it must be protected by a
- * lock. But doing so will mean that start_this_handle() can not be
- * run in parallel on SMP systems, which limits our scalability. So
- * unless debugging is enabled, we no longer update t_max_wait, which
- * means that maximum wait time reported by the jbd2_run_stats
- * tracepoint will always be zero.
+ * t_max_wait is carefully updated here with use of atomic compare exchange.
+ * Note that there could be multiple threads trying to do this simultaneously
+ * hence using cmpxchg to avoid any use of locks in this case.
+ * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
*/
static inline void update_t_max_wait(transaction_t *transaction,
unsigned long ts)
{
-#ifdef CONFIG_JBD2_DEBUG
- if (jbd2_journal_enable_debug &&
- time_after(transaction->t_start, ts)) {
- ts = jbd2_time_diff(ts, transaction->t_start);
- spin_lock(&transaction->t_handle_lock);
- if (ts > transaction->t_max_wait)
- transaction->t_max_wait = ts;
- spin_unlock(&transaction->t_handle_lock);
+ unsigned long oldts, newts;
+
+ if (time_after(transaction->t_start, ts)) {
+ newts = jbd2_time_diff(ts, transaction->t_start);
+ oldts = READ_ONCE(transaction->t_max_wait);
+ while (oldts < newts)
+ oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
}
-#endif
}
/*
@@ -449,7 +444,7 @@ repeat:
}
/* OK, account for the buffers that this operation expects to
- * use and add the handle to the running transaction.
+ * use and add the handle to the running transaction.
*/
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
@@ -690,7 +685,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
DIV_ROUND_UP(
handle->h_revoke_credits_requested,
journal->j_revoke_records_per_block);
- spin_lock(&transaction->t_handle_lock);
wanted = atomic_add_return(nblocks,
&transaction->t_outstanding_credits);
@@ -698,7 +692,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
jbd_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
atomic_sub(nblocks, &transaction->t_outstanding_credits);
- goto unlock;
+ goto error_out;
}
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
@@ -714,8 +708,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
-unlock:
- spin_unlock(&transaction->t_handle_lock);
error_out:
read_unlock(&journal->j_state_lock);
return result;
@@ -836,6 +828,43 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
}
EXPORT_SYMBOL(jbd2_journal_restart);
+/*
+ * Waits for any outstanding t_updates to finish.
+ * This is called with write j_state_lock held.
+ */
+void jbd2_journal_wait_updates(journal_t *journal)
+{
+ DEFINE_WAIT(wait);
+
+ while (1) {
+ /*
+ * Note that the running transaction can get freed under us if
+ * this transaction is getting committed in
+ * jbd2_journal_commit_transaction() ->
+ * jbd2_journal_free_transaction(). This can only happen when we
+ * release j_state_lock -> schedule() -> acquire j_state_lock.
+ * Hence we should always retrieve the new j_running_transaction
+ * value (after j_state_lock release acquire cycle), else it may
+ * lead to use-after-free of old freed transaction.
+ */
+ transaction_t *transaction = journal->j_running_transaction;
+
+ if (!transaction)
+ break;
+
+ prepare_to_wait(&journal->j_wait_updates, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&transaction->t_updates)) {
+ finish_wait(&journal->j_wait_updates, &wait);
+ break;
+ }
+ write_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_wait_updates, &wait);
+ write_lock(&journal->j_state_lock);
+ }
+}
+
/**
* jbd2_journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
@@ -848,8 +877,6 @@ EXPORT_SYMBOL(jbd2_journal_restart);
*/
void jbd2_journal_lock_updates(journal_t *journal)
{
- DEFINE_WAIT(wait);
-
jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
@@ -863,27 +890,9 @@ void jbd2_journal_lock_updates(journal_t *journal)
write_lock(&journal->j_state_lock);
}
- /* Wait until there are no running updates */
- while (1) {
- transaction_t *transaction = journal->j_running_transaction;
-
- if (!transaction)
- break;
+ /* Wait until there are no running t_updates */
+ jbd2_journal_wait_updates(journal);
- spin_lock(&transaction->t_handle_lock);
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&transaction->t_updates)) {
- spin_unlock(&transaction->t_handle_lock);
- finish_wait(&journal->j_wait_updates, &wait);
- break;
- }
- spin_unlock(&transaction->t_handle_lock);
- write_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_updates, &wait);
- write_lock(&journal->j_state_lock);
- }
write_unlock(&journal->j_state_lock);
/*
@@ -2208,14 +2217,14 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
}
/*
- * jbd2_journal_invalidatepage
+ * jbd2_journal_invalidate_folio
*
* This code is tricky. It has a number of cases to deal with.
*
* There are two invariants which this code relies on:
*
- * i_size must be updated on disk before we start calling invalidatepage on the
- * data.
+ * i_size must be updated on disk before we start calling invalidate_folio
+ * on the data.
*
* This is done in ext3 by defining an ext3_setattr method which
* updates i_size before truncate gets going. By maintaining this
@@ -2417,9 +2426,9 @@ zap_buffer_unlocked:
}
/**
- * jbd2_journal_invalidatepage()
+ * jbd2_journal_invalidate_folio()
* @journal: journal to use for flush...
- * @page: page to flush
+ * @folio: folio to flush
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
@@ -2428,30 +2437,29 @@ zap_buffer_unlocked:
* the page is straddling i_size. Caller then has to wait for current commit
* and try again.
*/
-int jbd2_journal_invalidatepage(journal_t *journal,
- struct page *page,
- unsigned int offset,
- unsigned int length)
+int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
+ size_t offset, size_t length)
{
struct buffer_head *head, *bh, *next;
unsigned int stop = offset + length;
unsigned int curr_off = 0;
- int partial_page = (offset || length < PAGE_SIZE);
+ int partial_page = (offset || length < folio_size(folio));
int may_free = 1;
int ret = 0;
- if (!PageLocked(page))
+ if (!folio_test_locked(folio))
BUG();
- if (!page_has_buffers(page))
+ head = folio_buffers(folio);
+ if (!head)
return 0;
- BUG_ON(stop > PAGE_SIZE || stop < length);
+ BUG_ON(stop > folio_size(folio) || stop < length);
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
- head = bh = page_buffers(page);
+ bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
@@ -2474,8 +2482,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
} while (bh != head);
if (!partial_page) {
- if (may_free && try_to_free_buffers(page))
- J_ASSERT(!page_has_buffers(page));
+ if (may_free && try_to_free_buffers(&folio->page))
+ J_ASSERT(!folio_buffers(folio));
}
return 0;
}
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 2b4d5013dc5d..6da92ecaf66d 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -161,5 +161,5 @@ static int jffs2_garbage_collect_thread(void *_c)
spin_lock(&c->erase_completion_lock);
c->gc_task = NULL;
spin_unlock(&c->erase_completion_lock);
- complete_and_exit(&c->gc_thread_exit, 0);
+ kthread_complete_and_exit(&c->gc_thread_exit, 0);
}
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index b288c8ae1236..837cd55fd4c5 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -415,13 +415,15 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
ret = -EIO;
- goto out_free;
+ goto out_sum_exit;
}
jffs2_calc_trigger_levels(c);
return 0;
+ out_sum_exit:
+ jffs2_sum_exit(c);
out_free:
kvfree(c->blocks);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 4fc8cd698d1a..bd7d58d27bfc 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -136,20 +136,15 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page *pg;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
pgoff_t index = pos >> PAGE_SHIFT;
uint32_t pageofs = index << PAGE_SHIFT;
int ret = 0;
- pg = grab_cache_page_write_begin(mapping, index, flags);
- if (!pg)
- return -ENOMEM;
- *pagep = pg;
-
jffs2_dbg(1, "%s()\n", __func__);
if (pageofs > inode->i_size) {
/* Make new hole frag from old EOF to new page */
- struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
uint32_t alloc_len;
@@ -160,7 +155,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
if (ret)
- goto out_page;
+ goto out_err;
mutex_lock(&f->sem);
memset(&ri, 0, sizeof(ri));
@@ -190,7 +185,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
ret = PTR_ERR(fn);
jffs2_complete_reservation(c);
mutex_unlock(&f->sem);
- goto out_page;
+ goto out_err;
}
ret = jffs2_add_full_dnode_to_inode(c, f, fn);
if (f->metadata) {
@@ -205,7 +200,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
jffs2_free_full_dnode(fn);
jffs2_complete_reservation(c);
mutex_unlock(&f->sem);
- goto out_page;
+ goto out_err;
}
jffs2_complete_reservation(c);
inode->i_size = pageofs;
@@ -213,6 +208,19 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
}
/*
+ * While getting a page and reading data in, lock c->alloc_sem until
+ * the page is Uptodate. Otherwise GC task may attempt to read the same
+ * page in read_cache_page(), which causes a deadlock.
+ */
+ mutex_lock(&c->alloc_sem);
+ pg = grab_cache_page_write_begin(mapping, index, flags);
+ if (!pg) {
+ ret = -ENOMEM;
+ goto release_sem;
+ }
+ *pagep = pg;
+
+ /*
* Read in the page if it wasn't already present. Cannot optimize away
* the whole page write case until jffs2_write_end can handle the
* case of a short-copy.
@@ -221,15 +229,17 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
mutex_lock(&f->sem);
ret = jffs2_do_readpage_nolock(inode, pg);
mutex_unlock(&f->sem);
- if (ret)
- goto out_page;
+ if (ret) {
+ unlock_page(pg);
+ put_page(pg);
+ goto release_sem;
+ }
}
jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
- return ret;
-out_page:
- unlock_page(pg);
- put_page(pg);
+release_sem:
+ mutex_unlock(&c->alloc_sem);
+out_err:
return ret;
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 2ac410477c4f..71f03a5d36ed 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -603,8 +603,8 @@ out_root:
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
kvfree(c->blocks);
- out_inohash:
jffs2_clear_xattr_subsystem(c);
+ out_inohash:
kfree(c->inocache_list);
out_wbuf:
jffs2_flash_cleanup(c);
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 2e4a86763c07..93a2951538ce 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -18,11 +18,11 @@
#include <linux/mutex.h>
struct jffs2_inode_info {
- /* We need an internal mutex similar to inode->i_mutex.
+ /* We need an internal mutex similar to inode->i_rwsem.
Unfortunately, we can't use the existing one, because
either the GC would deadlock, or we'd have to release it
before letting GC proceed. Or we'd have to put ugliness
- into the GC code so it didn't attempt to obtain the i_mutex
+ into the GC code so it didn't attempt to obtain the i_rwsem
for the inode(s) which are already locked */
struct mutex sem;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index b676056826be..29671e33a171 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -136,7 +136,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
if (!s) {
JFFS2_WARNING("Can't allocate memory for summary\n");
ret = -ENOMEM;
- goto out;
+ goto out_buf;
}
}
@@ -275,13 +275,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
}
ret = 0;
out:
+ jffs2_sum_reset_collected(s);
+ kfree(s);
+ out_buf:
if (buf_size)
kfree(flashbuf);
#ifndef __ECOS
else
mtd_unpoint(c->mtd, 0, c->mtd->size);
#endif
- kfree(s);
return ret;
}
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 81ca58c10b72..7ea37f49f1e1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -39,7 +39,7 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
{
struct jffs2_inode_info *f;
- f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL);
+ f = alloc_inode_sb(sb, jffs2_inode_cachep, GFP_KERNEL);
if (!f)
return NULL;
return &f->vfs_inode;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 57ab424c05ff..d1943a7b4b04 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -146,12 +146,13 @@ void jfs_evict_inode(struct inode *inode)
dquot_initialize(inode);
if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
+ struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap;
truncate_inode_pages_final(&inode->i_data);
if (test_cflag(COMMIT_Freewmap, inode))
jfs_free_zero_link(inode);
- if (JFS_SBI(inode->i_sb)->ipimap)
+ if (ipimap && JFS_IP(ipimap)->i_imap)
diFree(inode);
/*
@@ -357,7 +358,8 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
const struct address_space_operations jfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = jfs_readpage,
.readahead = jfs_readahead,
.writepage = jfs_writepage,
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 91f4ec93dab1..d8502f4989d9 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -148,6 +148,7 @@ static const s8 budtab[256] = {
* 0 - success
* -ENOMEM - insufficient memory
* -EIO - i/o error
+ * -EINVAL - wrong bmap data
*/
int dbMount(struct inode *ipbmap)
{
@@ -179,6 +180,12 @@ int dbMount(struct inode *ipbmap)
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
+ if (!bmp->db_numag) {
+ release_metapage(mp);
+ kfree(bmp);
+ return -EINVAL;
+ }
+
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 78fd136ac13b..997c81fcea34 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1980,17 +1980,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bp->l_flag |= lbmREAD;
- bio = bio_alloc(GFP_NOFS, 1);
-
+ bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
- bio_set_dev(bio, log->bdev);
-
bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio->bi_opf = REQ_OP_READ;
/*check if journaling to disk has been disabled*/
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
@@ -2125,16 +2121,13 @@ static void lbmStartIO(struct lbuf * bp)
jfs_info("lbmStartIO");
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
- bio_set_dev(bio, log->bdev);
-
bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 176580f54af9..c4220ccdedef 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/seq_file.h>
+#include <linux/writeback.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
@@ -416,12 +417,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
}
len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOFS);
bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_write_end_io;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
/* Don't call bio_add_page yet, we may add to this vec */
bio_offset = offset;
@@ -496,13 +495,12 @@ static int metapage_readpage(struct file *fp, struct page *page)
if (bio)
submit_bio(bio);
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_READ,
+ GFP_NOFS);
bio->bi_iter.bi_sector =
pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_read_end_io;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
len = xlen << inode->i_blkbits;
offset = block_offset << inode->i_blkbits;
if (bio_add_page(bio, page, len, offset) < len)
@@ -554,22 +552,22 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void metapage_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void metapage_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- BUG_ON(offset || length < PAGE_SIZE);
+ BUG_ON(offset || length < folio_size(folio));
- BUG_ON(PageWriteback(page));
+ BUG_ON(folio_test_writeback(folio));
- metapage_releasepage(page, 0);
+ metapage_releasepage(&folio->page, 0);
}
const struct address_space_operations jfs_metapage_aops = {
.readpage = metapage_readpage,
.writepage = metapage_writepage,
.releasepage = metapage_releasepage,
- .invalidatepage = metapage_invalidatepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .invalidate_folio = metapage_invalidate_folio,
+ .dirty_folio = filemap_dirty_folio,
};
struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 5d7d7170c03c..aa4ff7bcaff2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -81,14 +81,14 @@ int jfs_mount(struct super_block *sb)
* (initialize mount inode from the superblock)
*/
if ((rc = chkSuper(sb))) {
- goto errout20;
+ goto out;
}
ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
if (ipaimap == NULL) {
jfs_err("jfs_mount: Failed to read AGGREGATE_I");
rc = -EIO;
- goto errout20;
+ goto out;
}
sbi->ipaimap = ipaimap;
@@ -99,7 +99,7 @@ int jfs_mount(struct super_block *sb)
*/
if ((rc = diMount(ipaimap))) {
jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc);
- goto errout21;
+ goto err_ipaimap;
}
/*
@@ -108,7 +108,7 @@ int jfs_mount(struct super_block *sb)
ipbmap = diReadSpecial(sb, BMAP_I, 0);
if (ipbmap == NULL) {
rc = -EIO;
- goto errout22;
+ goto err_umount_ipaimap;
}
jfs_info("jfs_mount: ipbmap:0x%p", ipbmap);
@@ -120,7 +120,7 @@ int jfs_mount(struct super_block *sb)
*/
if ((rc = dbMount(ipbmap))) {
jfs_err("jfs_mount: dbMount failed w/rc = %d", rc);
- goto errout22;
+ goto err_ipbmap;
}
/*
@@ -139,7 +139,7 @@ int jfs_mount(struct super_block *sb)
if (!ipaimap2) {
jfs_err("jfs_mount: Failed to read AGGREGATE_I");
rc = -EIO;
- goto errout35;
+ goto err_umount_ipbmap;
}
sbi->ipaimap2 = ipaimap2;
@@ -151,7 +151,7 @@ int jfs_mount(struct super_block *sb)
if ((rc = diMount(ipaimap2))) {
jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d",
rc);
- goto errout35;
+ goto err_ipaimap2;
}
} else
/* Secondary aggregate inode table is not valid */
@@ -168,7 +168,7 @@ int jfs_mount(struct super_block *sb)
jfs_err("jfs_mount: Failed to read FILESYSTEM_I");
/* open fileset secondary inode allocation map */
rc = -EIO;
- goto errout40;
+ goto err_umount_ipaimap2;
}
jfs_info("jfs_mount: ipimap:0x%p", ipimap);
@@ -178,41 +178,34 @@ int jfs_mount(struct super_block *sb)
/* initialize fileset inode allocation map */
if ((rc = diMount(ipimap))) {
jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
- goto errout41;
+ goto err_ipimap;
}
- goto out;
+ return rc;
/*
* unwind on error
*/
- errout41: /* close fileset inode allocation map inode */
+err_ipimap:
+ /* close fileset inode allocation map inode */
diFreeSpecial(ipimap);
-
- errout40: /* fileset closed */
-
+err_umount_ipaimap2:
/* close secondary aggregate inode allocation map */
- if (ipaimap2) {
+ if (ipaimap2)
diUnmount(ipaimap2, 1);
+err_ipaimap2:
+ /* close aggregate inodes */
+ if (ipaimap2)
diFreeSpecial(ipaimap2);
- }
-
- errout35:
-
- /* close aggregate block allocation map */
+err_umount_ipbmap: /* close aggregate block allocation map */
dbUnmount(ipbmap, 1);
+err_ipbmap: /* close aggregate inodes */
diFreeSpecial(ipbmap);
-
- errout22: /* close aggregate inode allocation map */
-
+err_umount_ipaimap: /* close aggregate inode allocation map */
diUnmount(ipaimap, 1);
-
- errout21: /* close aggregate inodes */
+err_ipaimap: /* close aggregate inodes */
diFreeSpecial(ipaimap);
- errout20: /* aggregate closed */
-
- out:
-
+out:
if (rc)
jfs_err("Mount JFS Failure: %d", rc);
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index bde787c354fc..8b9a72ae5efa 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -86,8 +86,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
goto out;
}
- VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits;
-
+ VolumeSize = sb_bdev_nr_blocks(sb);
if (VolumeSize) {
if (newLVSize > VolumeSize) {
printk(KERN_WARNING "jfs_extendfs: invalid size\n");
@@ -199,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
txQuiesce(sb);
/* Reset size of direct inode */
- sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+ sbi->direct_inode->i_size = bdev_nr_bytes(sb->s_bdev);
if (sbi->mntflag & JFS_INLINELOG) {
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 9030aeaf0f88..f1a13a74cddf 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -102,7 +102,7 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
{
struct jfs_inode_info *jfs_inode;
- jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
+ jfs_inode = alloc_inode_sb(sb, jfs_inode_cachep, GFP_NOFS);
if (!jfs_inode)
return NULL;
#ifdef CONFIG_QUOTA
@@ -284,8 +284,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
}
case Opt_resize_nosize:
{
- *newLVSize = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ *newLVSize = sb_bdev_nr_blocks(sb);
if (*newLVSize == 0)
pr_err("JFS: Cannot determine volume size\n");
break;
@@ -551,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
ret = -ENOMEM;
goto out_unload;
}
- inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+ inode->i_size = bdev_nr_bytes(sb->s_bdev);
inode->i_mapping->a_ops = &jfs_metapage_aops;
inode_fake_hash(inode);
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8e0a1378a4b1..e205fde7163a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -17,7 +17,6 @@
#include "kernfs-internal.h"
-DECLARE_RWSEM(kernfs_rwsem);
static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
@@ -26,7 +25,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
static bool kernfs_active(struct kernfs_node *kn)
{
- lockdep_assert_held(&kernfs_rwsem);
+ lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem);
return atomic_read(&kn->active) >= 0;
}
@@ -457,14 +456,15 @@ void kernfs_put_active(struct kernfs_node *kn)
* return after draining is complete.
*/
static void kernfs_drain(struct kernfs_node *kn)
- __releases(&kernfs_rwsem) __acquires(&kernfs_rwsem)
+ __releases(&kernfs_root(kn)->kernfs_rwsem)
+ __acquires(&kernfs_root(kn)->kernfs_rwsem)
{
struct kernfs_root *root = kernfs_root(kn);
- lockdep_assert_held_write(&kernfs_rwsem);
+ lockdep_assert_held_write(&root->kernfs_rwsem);
WARN_ON_ONCE(kernfs_active(kn));
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
if (kernfs_lockdep(kn)) {
rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
@@ -483,7 +483,7 @@ static void kernfs_drain(struct kernfs_node *kn)
kernfs_drain_open_files(kn);
- down_write(&kernfs_rwsem);
+ down_write(&root->kernfs_rwsem);
}
/**
@@ -718,11 +718,12 @@ err_unlock:
int kernfs_add_one(struct kernfs_node *kn)
{
struct kernfs_node *parent = kn->parent;
+ struct kernfs_root *root = kernfs_root(parent);
struct kernfs_iattrs *ps_iattr;
bool has_ns;
int ret;
- down_write(&kernfs_rwsem);
+ down_write(&root->kernfs_rwsem);
ret = -EINVAL;
has_ns = kernfs_ns_enabled(parent);
@@ -753,7 +754,7 @@ int kernfs_add_one(struct kernfs_node *kn)
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
/*
* Activate the new node unless CREATE_DEACTIVATED is requested.
@@ -767,7 +768,7 @@ int kernfs_add_one(struct kernfs_node *kn)
return 0;
out_unlock:
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
return ret;
}
@@ -788,7 +789,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
bool has_ns = kernfs_ns_enabled(parent);
unsigned int hash;
- lockdep_assert_held(&kernfs_rwsem);
+ lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem);
if (has_ns != (bool)ns) {
WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
@@ -820,7 +821,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
size_t len;
char *p, *name;
- lockdep_assert_held_read(&kernfs_rwsem);
+ lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
/* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
spin_lock_irq(&kernfs_rename_lock);
@@ -859,11 +860,12 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
const char *name, const void *ns)
{
struct kernfs_node *kn;
+ struct kernfs_root *root = kernfs_root(parent);
- down_read(&kernfs_rwsem);
+ down_read(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
kernfs_get(kn);
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
return kn;
}
@@ -883,11 +885,12 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
const char *path, const void *ns)
{
struct kernfs_node *kn;
+ struct kernfs_root *root = kernfs_root(parent);
- down_read(&kernfs_rwsem);
+ down_read(&root->kernfs_rwsem);
kn = kernfs_walk_ns(parent, path, ns);
kernfs_get(kn);
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
return kn;
}
@@ -912,6 +915,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
return ERR_PTR(-ENOMEM);
idr_init(&root->ino_idr);
+ init_rwsem(&root->kernfs_rwsem);
INIT_LIST_HEAD(&root->supers);
/*
@@ -957,7 +961,22 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
*/
void kernfs_destroy_root(struct kernfs_root *root)
{
- kernfs_remove(root->kn); /* will also free @root */
+ /*
+ * kernfs_remove() locks and unlocks root->kernfs_rwsem, so @root must
+ * outlive the call: hold a reference on the root node across it.
+ */
+ kernfs_get(root->kn);
+ kernfs_remove(root->kn);
+ kernfs_put(root->kn); /* will also free @root */
+}
+
+/**
+ * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root
+ * @root: root to look up; dereferenced without a NULL check
+ */
+struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
+{
+ return root->kn;
}
/**
@@ -1035,6 +1054,7 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
{
struct kernfs_node *kn;
+ struct kernfs_root *root;
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -1046,18 +1066,19 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
/* If the kernfs parent node has changed discard and
* proceed to ->lookup.
*/
- down_read(&kernfs_rwsem);
spin_lock(&dentry->d_lock);
parent = kernfs_dentry_node(dentry->d_parent);
if (parent) {
+ spin_unlock(&dentry->d_lock);
+ root = kernfs_root(parent);
+ down_read(&root->kernfs_rwsem);
if (kernfs_dir_changed(parent, dentry)) {
- spin_unlock(&dentry->d_lock);
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
return 0;
}
- }
- spin_unlock(&dentry->d_lock);
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
+ } else
+ spin_unlock(&dentry->d_lock);
/* The kernfs parent node hasn't changed, leave the
* dentry negative and return success.
@@ -1066,7 +1087,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
}
kn = kernfs_dentry_node(dentry);
- down_read(&kernfs_rwsem);
+ root = kernfs_root(kn);
+ down_read(&root->kernfs_rwsem);
/* The kernfs node has been deactivated */
if (!kernfs_active(kn))
@@ -1085,10 +1107,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
kernfs_info(dentry->d_sb)->ns != kn->ns)
goto out_bad;
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
return 1;
out_bad:
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
return 0;
}
@@ -1102,10 +1124,12 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
{
struct kernfs_node *parent = dir->i_private;
struct kernfs_node *kn;
+ struct kernfs_root *root;
struct inode *inode = NULL;
const void *ns = NULL;
- down_read(&kernfs_rwsem);
+ root = kernfs_root(parent);
+ down_read(&root->kernfs_rwsem);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dir->i_sb)->ns;
@@ -1116,7 +1140,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
* create a negative.
*/
if (!kernfs_active(kn)) {
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
return NULL;
}
inode = kernfs_get_inode(dir->i_sb, kn);
@@ -1131,7 +1155,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
*/
if (!IS_ERR(inode))
kernfs_set_rev(parent, dentry);
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
/* instantiate and hash (possibly negative) dentry */
return d_splice_alias(inode, dentry);
@@ -1254,7 +1278,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
{
struct rb_node *rbn;
- lockdep_assert_held_write(&kernfs_rwsem);
+ lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem);
/* if first iteration, visit leftmost descendant which may be root */
if (!pos)
@@ -1289,8 +1313,9 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
void kernfs_activate(struct kernfs_node *kn)
{
struct kernfs_node *pos;
+ struct kernfs_root *root = kernfs_root(kn);
- down_write(&kernfs_rwsem);
+ down_write(&root->kernfs_rwsem);
pos = NULL;
while ((pos = kernfs_next_descendant_post(pos, kn))) {
@@ -1304,14 +1329,14 @@ void kernfs_activate(struct kernfs_node *kn)
pos->flags |= KERNFS_ACTIVATED;
}
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
}
static void __kernfs_remove(struct kernfs_node *kn)
{
struct kernfs_node *pos;
- lockdep_assert_held_write(&kernfs_rwsem);
+ lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
/*
* Short-circuit if non-root @kn has already finished removal.
@@ -1381,9 +1406,16 @@ static void __kernfs_remove(struct kernfs_node *kn)
*/
void kernfs_remove(struct kernfs_node *kn)
{
- down_write(&kernfs_rwsem);
+ struct kernfs_root *root;
+
+ if (!kn)
+ return; /* no node, hence no root whose lock could be taken */
+
+ root = kernfs_root(kn);
+
+ down_write(&root->kernfs_rwsem); /* __kernfs_remove() asserts this is write-held */
__kernfs_remove(kn);
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
}
/**
@@ -1469,8 +1501,9 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn)
bool kernfs_remove_self(struct kernfs_node *kn)
{
bool ret;
+ struct kernfs_root *root = kernfs_root(kn);
- down_write(&kernfs_rwsem);
+ down_write(&root->kernfs_rwsem);
kernfs_break_active_protection(kn);
/*
@@ -1498,9 +1531,9 @@ bool kernfs_remove_self(struct kernfs_node *kn)
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
break;
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
schedule();
- down_write(&kernfs_rwsem);
+ down_write(&root->kernfs_rwsem);
}
finish_wait(waitq, &wait);
WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
@@ -1513,7 +1546,7 @@ bool kernfs_remove_self(struct kernfs_node *kn)
*/
kernfs_unbreak_active_protection(kn);
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
return ret;
}
@@ -1530,6 +1563,7 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
const void *ns)
{
struct kernfs_node *kn;
+ struct kernfs_root *root;
if (!parent) {
WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
@@ -1537,13 +1571,14 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
return -ENOENT;
}
- down_write(&kernfs_rwsem);
+ root = kernfs_root(parent);
+ down_write(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
if (kn)
__kernfs_remove(kn);
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
if (kn)
return 0;
@@ -1562,6 +1597,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name, const void *new_ns)
{
struct kernfs_node *old_parent;
+ struct kernfs_root *root;
const char *old_name = NULL;
int error;
@@ -1569,7 +1605,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
if (!kn->parent)
return -EINVAL;
- down_write(&kernfs_rwsem);
+ root = kernfs_root(kn);
+ down_write(&root->kernfs_rwsem);
error = -ENOENT;
if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
@@ -1623,7 +1660,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
error = 0;
out:
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
return error;
}
@@ -1694,11 +1731,14 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
struct dentry *dentry = file->f_path.dentry;
struct kernfs_node *parent = kernfs_dentry_node(dentry);
struct kernfs_node *pos = file->private_data;
+ struct kernfs_root *root;
const void *ns = NULL;
if (!dir_emit_dots(file, ctx))
return 0;
- down_read(&kernfs_rwsem);
+
+ root = kernfs_root(parent);
+ down_read(&root->kernfs_rwsem);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dentry->d_sb)->ns;
@@ -1715,12 +1755,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
file->private_data = pos;
kernfs_get(pos);
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
if (!dir_emit(ctx, name, len, ino, type))
return 0;
- down_read(&kernfs_rwsem);
+ down_read(&root->kernfs_rwsem);
}
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
file->private_data = NULL;
ctx->pos = INT_MAX;
return 0;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 60e2a86c535e..88423069407c 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -120,13 +120,8 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
if (next == ERR_PTR(-ENODEV))
kernfs_seq_stop_active(sf, next);
return next;
- } else {
- /*
- * The same behavior and code as single_open(). Returns
- * !NULL if pos is at the beginning; otherwise, NULL.
- */
- return NULL + !*ppos;
}
+ return single_start(sf, ppos);
}
static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
@@ -847,6 +842,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
{
struct kernfs_node *kn;
struct kernfs_super_info *info;
+ struct kernfs_root *root;
repeat:
/* pop one off the notify_list */
spin_lock_irq(&kernfs_notify_lock);
@@ -859,8 +855,9 @@ repeat:
kn->attr.notify_next = NULL;
spin_unlock_irq(&kernfs_notify_lock);
+ root = kernfs_root(kn);
/* kick fsnotify */
- down_write(&kernfs_rwsem);
+ down_write(&root->kernfs_rwsem);
list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
struct kernfs_node *parent;
@@ -898,7 +895,7 @@ repeat:
iput(inode);
}
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
kernfs_put(kn);
goto repeat;
}
@@ -1000,7 +997,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
#endif
/*
- * kn->attr.ops is accesible only while holding active ref. We
+ * kn->attr.ops is accessible only while holding active ref. We
* need to know whether some ops are implemented outside active
* ref. Cache their existence in flags.
*/
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index c0eae1725435..3d783d80f5da 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -99,10 +99,11 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
int ret;
+ struct kernfs_root *root = kernfs_root(kn);
- down_write(&kernfs_rwsem);
+ down_write(&root->kernfs_rwsem);
ret = __kernfs_setattr(kn, iattr);
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
return ret;
}
@@ -111,12 +112,14 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct kernfs_node *kn = inode->i_private;
+ struct kernfs_root *root;
int error;
if (!kn)
return -EINVAL;
- down_write(&kernfs_rwsem);
+ root = kernfs_root(kn);
+ down_write(&root->kernfs_rwsem);
error = setattr_prepare(&init_user_ns, dentry, iattr);
if (error)
goto out;
@@ -129,7 +132,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
setattr_copy(&init_user_ns, inode, iattr);
out:
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
return error;
}
@@ -184,13 +187,14 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
{
struct inode *inode = d_inode(path->dentry);
struct kernfs_node *kn = inode->i_private;
+ struct kernfs_root *root = kernfs_root(kn);
- down_read(&kernfs_rwsem);
+ down_read(&root->kernfs_rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&init_user_ns, inode, stat);
spin_unlock(&inode->i_lock);
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
return 0;
}
@@ -274,19 +278,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
struct kernfs_node *kn;
+ struct kernfs_root *root;
int ret;
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
kn = inode->i_private;
+ root = kernfs_root(kn);
- down_read(&kernfs_rwsem);
+ down_read(&root->kernfs_rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&init_user_ns, inode, mask);
spin_unlock(&inode->i_lock);
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
return ret;
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index f9cc912c31e1..eeaa779b929c 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -31,6 +31,24 @@ struct kernfs_iattrs {
atomic_t user_xattr_size;
};
+struct kernfs_root {
+ /* published fields */
+ struct kernfs_node *kn; /* returned by kernfs_root_to_node() */
+ unsigned int flags; /* KERNFS_ROOT_* flags */
+
+ /* private fields, do not use outside kernfs proper */
+ struct idr ino_idr; /* see kernfs_idr_lock in dir.c */
+ u32 last_id_lowbits;
+ u32 id_highbits;
+ struct kernfs_syscall_ops *syscall_ops;
+
+ /* kernfs_super_info list of this root, protected by kernfs_rwsem below */
+ struct list_head supers;
+
+ wait_queue_head_t deactivate_waitq;
+ struct rw_semaphore kernfs_rwsem; /* per-root; init_rwsem() in kernfs_create_root() */
+};
+
/* +1 to avoid triggering overflow warning when negating it */
#define KN_DEACTIVATED_BIAS (INT_MIN + 1)
@@ -122,7 +140,6 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
/*
* dir.c
*/
-extern struct rw_semaphore kernfs_rwsem;
extern const struct dentry_operations kernfs_dops;
extern const struct file_operations kernfs_dir_fops;
extern const struct inode_operations kernfs_dir_iops;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f2f909d09f52..cfa79715fc1a 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -236,6 +236,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
{
struct kernfs_super_info *info = kernfs_info(sb);
+ struct kernfs_root *kf_root = kfc->root;
struct inode *inode;
struct dentry *root;
@@ -255,9 +256,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
sb->s_shrink.seeks = 0;
/* get root inode, initialize and unlock it */
- down_read(&kernfs_rwsem);
+ down_read(&kf_root->kernfs_rwsem);
inode = kernfs_get_inode(sb, info->root->kn);
- up_read(&kernfs_rwsem);
+ up_read(&kf_root->kernfs_rwsem);
if (!inode) {
pr_debug("kernfs: could not get root inode\n");
return -ENOMEM;
@@ -334,6 +335,7 @@ int kernfs_get_tree(struct fs_context *fc)
if (!sb->s_root) {
struct kernfs_super_info *info = kernfs_info(sb);
+ struct kernfs_root *root = kfc->root;
kfc->new_sb_created = true;
@@ -344,9 +346,9 @@ int kernfs_get_tree(struct fs_context *fc)
}
sb->s_flags |= SB_ACTIVE;
- down_write(&kernfs_rwsem);
+ down_write(&root->kernfs_rwsem);
list_add(&info->node, &info->root->supers);
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
}
fc->root = dget(sb->s_root);
@@ -371,10 +373,11 @@ void kernfs_free_fs_context(struct fs_context *fc)
void kernfs_kill_sb(struct super_block *sb)
{
struct kernfs_super_info *info = kernfs_info(sb);
+ struct kernfs_root *root = info->root;
- down_write(&kernfs_rwsem);
+ down_write(&root->kernfs_rwsem);
list_del(&info->node);
- up_write(&kernfs_rwsem);
+ up_write(&root->kernfs_rwsem);
/*
* Remove the superblock from fs_supers/s_instances
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index c8f8e41b8411..0ab13824822f 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -36,8 +36,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
gid = target->iattr->ia_gid;
}
- kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, uid, gid,
- KERNFS_LINK);
+ kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK);
if (!kn)
return ERR_PTR(-ENOMEM);
@@ -114,11 +113,12 @@ static int kernfs_getlink(struct inode *inode, char *path)
struct kernfs_node *kn = inode->i_private;
struct kernfs_node *parent = kn->parent;
struct kernfs_node *target = kn->symlink.target_kn;
+ struct kernfs_root *root = kernfs_root(parent);
int error;
- down_read(&kernfs_rwsem);
+ down_read(&root->kernfs_rwsem);
error = kernfs_get_target_path(parent, target, path);
- up_read(&kernfs_rwsem);
+ up_read(&root->kernfs_rwsem);
return error;
}
diff --git a/fs/ksmbd/Kconfig b/fs/ksmbd/Kconfig
index b83cbd756ae5..e1fe17747ed6 100644
--- a/fs/ksmbd/Kconfig
+++ b/fs/ksmbd/Kconfig
@@ -6,7 +6,6 @@ config SMB_SERVER
select NLS
select NLS_UTF8
select CRYPTO
- select CRYPTO_MD4
select CRYPTO_MD5
select CRYPTO_HMAC
select CRYPTO_ECB
@@ -19,6 +18,7 @@ config SMB_SERVER
select CRYPTO_GCM
select ASN1
select OID_REGISTRY
+ select CRC32
default n
help
Choose Y here if you want to allow SMB3 compliant clients
diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c
index b014f4638610..c03eba090368 100644
--- a/fs/ksmbd/asn1.c
+++ b/fs/ksmbd/asn1.c
@@ -21,101 +21,11 @@
#include "ksmbd_spnego_negtokeninit.asn1.h"
#include "ksmbd_spnego_negtokentarg.asn1.h"
-#define SPNEGO_OID_LEN 7
#define NTLMSSP_OID_LEN 10
-#define KRB5_OID_LEN 7
-#define KRB5U2U_OID_LEN 8
-#define MSKRB5_OID_LEN 7
-static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
-static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
-static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
-static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 };
-static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
static char NTLMSSP_OID_STR[NTLMSSP_OID_LEN] = { 0x2b, 0x06, 0x01, 0x04, 0x01,
0x82, 0x37, 0x02, 0x02, 0x0a };
-static bool
-asn1_subid_decode(const unsigned char **begin, const unsigned char *end,
- unsigned long *subid)
-{
- const unsigned char *ptr = *begin;
- unsigned char ch;
-
- *subid = 0;
-
- do {
- if (ptr >= end)
- return false;
-
- ch = *ptr++;
- *subid <<= 7;
- *subid |= ch & 0x7F;
- } while ((ch & 0x80) == 0x80);
-
- *begin = ptr;
- return true;
-}
-
-static bool asn1_oid_decode(const unsigned char *value, size_t vlen,
- unsigned long **oid, size_t *oidlen)
-{
- const unsigned char *iptr = value, *end = value + vlen;
- unsigned long *optr;
- unsigned long subid;
-
- vlen += 1;
- if (vlen < 2 || vlen > UINT_MAX / sizeof(unsigned long))
- goto fail_nullify;
-
- *oid = kmalloc(vlen * sizeof(unsigned long), GFP_KERNEL);
- if (!*oid)
- return false;
-
- optr = *oid;
-
- if (!asn1_subid_decode(&iptr, end, &subid))
- goto fail;
-
- if (subid < 40) {
- optr[0] = 0;
- optr[1] = subid;
- } else if (subid < 80) {
- optr[0] = 1;
- optr[1] = subid - 40;
- } else {
- optr[0] = 2;
- optr[1] = subid - 80;
- }
-
- *oidlen = 2;
- optr += 2;
-
- while (iptr < end) {
- if (++(*oidlen) > vlen)
- goto fail;
-
- if (!asn1_subid_decode(&iptr, end, optr++))
- goto fail;
- }
- return true;
-
-fail:
- kfree(*oid);
-fail_nullify:
- *oid = NULL;
- return false;
-}
-
-static bool oid_eq(unsigned long *oid1, unsigned int oid1len,
- unsigned long *oid2, unsigned int oid2len)
-{
- if (oid1len != oid2len)
- return false;
-
- return memcmp(oid1, oid2, oid1len) == 0;
-}
-
int
ksmbd_decode_negTokenInit(unsigned char *security_blob, int length,
struct ksmbd_conn *conn)
@@ -252,26 +162,18 @@ int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen,
int ksmbd_gssapi_this_mech(void *context, size_t hdrlen, unsigned char tag,
const void *value, size_t vlen)
{
- unsigned long *oid;
- size_t oidlen;
- int err = 0;
-
- if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) {
- err = -EBADMSG;
- goto out;
- }
+ enum OID oid;
- if (!oid_eq(oid, oidlen, SPNEGO_OID, SPNEGO_OID_LEN))
- err = -EBADMSG;
- kfree(oid);
-out:
- if (err) {
+ oid = look_up_OID(value, vlen); /* classify the raw OID bytes */
+ if (oid != OID_spnego) { /* only the SPNEGO OID is accepted here */
char buf[50];
sprint_oid(value, vlen, buf, sizeof(buf));
ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf);
+ return -EBADMSG;
}
- return err;
+
+ return 0;
}
int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen,
@@ -279,37 +181,31 @@ int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen,
size_t vlen)
{
struct ksmbd_conn *conn = context;
- unsigned long *oid;
- size_t oidlen;
+ enum OID oid;
int mech_type;
- char buf[50];
- if (!asn1_oid_decode(value, vlen, &oid, &oidlen))
- goto fail;
-
- if (oid_eq(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN))
+ oid = look_up_OID(value, vlen);
+ if (oid == OID_ntlmssp) {
mech_type = KSMBD_AUTH_NTLMSSP;
- else if (oid_eq(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN))
+ } else if (oid == OID_mskrb5) {
mech_type = KSMBD_AUTH_MSKRB5;
- else if (oid_eq(oid, oidlen, KRB5_OID, KRB5_OID_LEN))
+ } else if (oid == OID_krb5) {
mech_type = KSMBD_AUTH_KRB5;
- else if (oid_eq(oid, oidlen, KRB5U2U_OID, KRB5U2U_OID_LEN))
+ } else if (oid == OID_krb5u2u) {
mech_type = KSMBD_AUTH_KRB5U2U;
- else
- goto fail;
+ } else {
+ char buf[50];
+
+ sprint_oid(value, vlen, buf, sizeof(buf));
+ ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf);
+ return -EBADMSG;
+ }
conn->auth_mechs |= mech_type;
if (conn->preferred_auth_mech == 0)
conn->preferred_auth_mech = mech_type;
- kfree(oid);
return 0;
-
-fail:
- kfree(oid);
- sprint_oid(value, vlen, buf, sizeof(buf));
- ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf);
- return -EBADMSG;
}
int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen,
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index 30a92ddc1817..911444d21267 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -29,6 +29,7 @@
#include "mgmt/user_config.h"
#include "crypto_ctx.h"
#include "transport_ipc.h"
+#include "../smbfs_common/arc4.h"
/*
* Fixed format data defining GSS header and fixed string
@@ -215,7 +216,7 @@ out:
* Return: 0 on success, error number on error
*/
int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
- int blen, char *domain_name)
+ int blen, char *domain_name, char *cryptkey)
{
char ntlmv2_hash[CIFS_ENCPWD_SIZE];
char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE];
@@ -256,7 +257,7 @@ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
goto out;
}
- memcpy(construct, sess->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE);
+ memcpy(construct, cryptkey, CIFS_CRYPTO_KEY_SIZE);
memcpy(construct + CIFS_CRYPTO_KEY_SIZE, &ntlmv2->blob_signature, blen);
rc = crypto_shash_update(CRYPTO_HMACMD5(ctx), construct, len);
@@ -295,7 +296,8 @@ out:
* Return: 0 on success, error number on error
*/
int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
- int blob_len, struct ksmbd_session *sess)
+ int blob_len, struct ksmbd_conn *conn,
+ struct ksmbd_session *sess)
{
char *domain_name;
unsigned int nt_off, dn_off;
@@ -324,7 +326,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
/* TODO : use domain name that imported from configuration file */
domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off,
- dn_len, true, sess->conn->local_nls);
+ dn_len, true, conn->local_nls);
if (IS_ERR(domain_name))
return PTR_ERR(domain_name);
@@ -333,8 +335,31 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
domain_name);
ret = ksmbd_auth_ntlmv2(sess, (struct ntlmv2_resp *)((char *)authblob + nt_off),
nt_len - CIFS_ENCPWD_SIZE,
- domain_name);
+ domain_name, conn->ntlmssp.cryptkey);
kfree(domain_name);
+
+ /* The recovered secondary session key */
+ if (conn->ntlmssp.client_flags & NTLMSSP_NEGOTIATE_KEY_XCH) {
+ struct arc4_ctx *ctx_arc4;
+ unsigned int sess_key_off, sess_key_len;
+
+ sess_key_off = le32_to_cpu(authblob->SessionKey.BufferOffset);
+ sess_key_len = le16_to_cpu(authblob->SessionKey.Length);
+
+ if (blob_len < (u64)sess_key_off + sess_key_len)
+ return -EINVAL;
+
+ ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL);
+ if (!ctx_arc4)
+ return -ENOMEM;
+
+ cifs_arc4_setkey(ctx_arc4, sess->sess_key,
+ SMB2_NTLMV2_SESSKEY_SIZE);
+ cifs_arc4_crypt(ctx_arc4, sess->sess_key,
+ (char *)authblob + sess_key_off, sess_key_len);
+ kfree_sensitive(ctx_arc4);
+ }
+
return ret;
}
@@ -347,7 +372,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
*
*/
int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
- int blob_len, struct ksmbd_session *sess)
+ int blob_len, struct ksmbd_conn *conn)
{
if (blob_len < sizeof(struct negotiate_message)) {
ksmbd_debug(AUTH, "negotiate blob len %d too small\n",
@@ -361,7 +386,7 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
return -EINVAL;
}
- sess->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags);
+ conn->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags);
return 0;
}
@@ -375,14 +400,14 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
*/
unsigned int
ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
- struct ksmbd_session *sess)
+ struct ksmbd_conn *conn)
{
struct target_info *tinfo;
wchar_t *name;
__u8 *target_name;
unsigned int flags, blob_off, blob_len, type, target_info_len = 0;
int len, uni_len, conv_len;
- int cflags = sess->ntlmssp.client_flags;
+ int cflags = conn->ntlmssp.client_flags;
memcpy(chgblob->Signature, NTLMSSP_SIGNATURE, 8);
chgblob->MessageType = NtLmChallenge;
@@ -403,10 +428,13 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
if (cflags & NTLMSSP_REQUEST_TARGET)
flags |= NTLMSSP_REQUEST_TARGET;
- if (sess->conn->use_spnego &&
+ if (conn->use_spnego &&
(cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+ if (cflags & NTLMSSP_NEGOTIATE_KEY_XCH)
+ flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
+
chgblob->NegotiateFlags = cpu_to_le32(flags);
len = strlen(ksmbd_netbios_name());
name = kmalloc(2 + UNICODE_LEN(len), GFP_KERNEL);
@@ -414,7 +442,7 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
return -ENOMEM;
conv_len = smb_strtoUTF16((__le16 *)name, ksmbd_netbios_name(), len,
- sess->conn->local_nls);
+ conn->local_nls);
if (conv_len < 0 || conv_len > len) {
kfree(name);
return -EINVAL;
@@ -430,8 +458,8 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
chgblob->TargetName.BufferOffset = cpu_to_le32(blob_off);
/* Initialize random conn challenge */
- get_random_bytes(sess->ntlmssp.cryptkey, sizeof(__u64));
- memcpy(chgblob->Challenge, sess->ntlmssp.cryptkey,
+ get_random_bytes(conn->ntlmssp.cryptkey, sizeof(__u64));
+ memcpy(chgblob->Challenge, conn->ntlmssp.cryptkey,
CIFS_CRYPTO_KEY_SIZE);
/* Add Target Information to security buffer */
@@ -873,9 +901,9 @@ int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf,
__u8 *pi_hash)
{
int rc;
- struct smb2_hdr *rcv_hdr = (struct smb2_hdr *)buf;
+ struct smb2_hdr *rcv_hdr = smb2_get_msg(buf);
char *all_bytes_msg = (char *)&rcv_hdr->ProtocolId;
- int msg_size = be32_to_cpu(rcv_hdr->smb2_buf_length);
+ int msg_size = get_rfc1002_len(buf);
struct ksmbd_crypto_ctx *ctx = NULL;
if (conn->preauth_info->Preauth_HashId !=
@@ -983,7 +1011,7 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec,
u8 *sign)
{
struct scatterlist *sg;
- unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+ unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
int i, nr_entries[3] = {0}, total_entries = 0, sg_idx = 0;
if (!nvec)
@@ -1047,9 +1075,8 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec,
int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
unsigned int nvec, int enc)
{
- struct smb2_transform_hdr *tr_hdr =
- (struct smb2_transform_hdr *)iov[0].iov_base;
- unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+ struct smb2_transform_hdr *tr_hdr = smb2_get_msg(iov[0].iov_base);
+ unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
int rc;
struct scatterlist *sg;
u8 sign[SMB2_SIGNATURE_SIZE] = {};
diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h
index 9c2d4badd05d..95629651cf26 100644
--- a/fs/ksmbd/auth.h
+++ b/fs/ksmbd/auth.h
@@ -38,16 +38,16 @@ struct kvec;
int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
unsigned int nvec, int enc);
void ksmbd_copy_gss_neg_header(void *buf);
-int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf);
int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
- int blen, char *domain_name);
+ int blen, char *domain_name, char *cryptkey);
int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
- int blob_len, struct ksmbd_session *sess);
+ int blob_len, struct ksmbd_conn *conn,
+ struct ksmbd_session *sess);
int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
- int blob_len, struct ksmbd_session *sess);
+ int blob_len, struct ksmbd_conn *conn);
unsigned int
ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
- struct ksmbd_session *sess);
+ struct ksmbd_conn *conn);
int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,
int in_len, char *out_blob, int *out_len);
int ksmbd_sign_smb2_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov,
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index b57a0d8a392f..208d2cff7bd3 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -62,6 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
conn->total_credits = 1;
+ conn->outstanding_credits = 1;
init_waitqueue_head(&conn->req_running_q);
INIT_LIST_HEAD(&conn->conns_list);
@@ -158,26 +159,25 @@ void ksmbd_conn_wait_idle(struct ksmbd_conn *conn)
int ksmbd_conn_write(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb_hdr *rsp_hdr = work->response_buf;
size_t len = 0;
int sent;
struct kvec iov[3];
int iov_idx = 0;
ksmbd_conn_try_dequeue_request(work);
- if (!rsp_hdr) {
+ if (!work->response_buf) {
pr_err("NULL response header\n");
return -EINVAL;
}
if (work->tr_buf) {
iov[iov_idx] = (struct kvec) { work->tr_buf,
- sizeof(struct smb2_transform_hdr) };
+ sizeof(struct smb2_transform_hdr) + 4 };
len += iov[iov_idx++].iov_len;
}
if (work->aux_payload_sz) {
- iov[iov_idx] = (struct kvec) { rsp_hdr, work->resp_hdr_sz };
+ iov[iov_idx] = (struct kvec) { work->response_buf, work->resp_hdr_sz };
len += iov[iov_idx++].iov_len;
iov[iov_idx] = (struct kvec) { work->aux_payload_buf, work->aux_payload_sz };
len += iov[iov_idx++].iov_len;
@@ -185,8 +185,8 @@ int ksmbd_conn_write(struct ksmbd_work *work)
if (work->tr_buf)
iov[iov_idx].iov_len = work->resp_hdr_sz;
else
- iov[iov_idx].iov_len = get_rfc1002_len(rsp_hdr) + 4;
- iov[iov_idx].iov_base = rsp_hdr;
+ iov[iov_idx].iov_len = get_rfc1002_len(work->response_buf) + 4;
+ iov[iov_idx].iov_base = work->response_buf;
len += iov[iov_idx++].iov_len;
}
@@ -387,17 +387,24 @@ out:
static void stop_sessions(void)
{
struct ksmbd_conn *conn;
+ struct ksmbd_transport *t;
again:
read_lock(&conn_list_lock);
list_for_each_entry(conn, &conn_list, conns_list) {
struct task_struct *task;
- task = conn->transport->handler;
+ t = conn->transport;
+ task = t->handler;
if (task)
ksmbd_debug(CONN, "Stop session handler %s/%d\n",
task->comm, task_pid_nr(task));
conn->status = KSMBD_SESS_EXITING;
+ if (t->ops->shutdown) {
+ read_unlock(&conn_list_lock);
+ t->ops->shutdown(t);
+ read_lock(&conn_list_lock);
+ }
}
read_unlock(&conn_list_lock);
diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h
index e5403c587a58..7a59aacb5daa 100644
--- a/fs/ksmbd/connection.h
+++ b/fs/ksmbd/connection.h
@@ -61,8 +61,8 @@ struct ksmbd_conn {
atomic_t req_running;
/* References which are made for this Server object*/
atomic_t r_count;
- unsigned short total_credits;
- unsigned short max_credits;
+ unsigned int total_credits;
+ unsigned int outstanding_credits;
spinlock_t credits_lock;
wait_queue_head_t req_running_q;
/* Lock to protect requests list*/
@@ -72,12 +72,7 @@ struct ksmbd_conn {
int connection_type;
struct ksmbd_stats stats;
char ClientGUID[SMB2_CLIENT_GUID_SIZE];
- union {
- /* pending trans request table */
- struct trans_state *recent_trans;
- /* Used by ntlmssp */
- char *ntlmssp_cryptkey;
- };
+ struct ntlmssp_auth ntlmssp;
spinlock_t llist_lock;
struct list_head lock_list;
@@ -122,6 +117,7 @@ struct ksmbd_conn_ops {
struct ksmbd_transport_ops {
int (*prepare)(struct ksmbd_transport *t);
void (*disconnect)(struct ksmbd_transport *t);
+ void (*shutdown)(struct ksmbd_transport *t);
int (*read)(struct ksmbd_transport *t, char *buf, unsigned int size);
int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov,
int size, bool need_invalidate_rkey,
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index c6718a05d347..ebe6ca08467a 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -103,6 +103,8 @@ struct ksmbd_startup_request {
* we set the SPARSE_FILES bit (0x40).
*/
__u32 sub_auth[3]; /* Subauth value for Security ID */
+ __u32 smb2_max_credits; /* MAX credits */
+ __u32 reserved[128]; /* Reserved room */
__u32 ifc_list_sz; /* interfaces list size */
__s8 ____payload[];
};
@@ -113,7 +115,7 @@ struct ksmbd_startup_request {
* IPC request to shutdown ksmbd server.
*/
struct ksmbd_shutdown_request {
- __s32 reserved;
+ __s32 reserved[16];
};
/*
@@ -122,6 +124,7 @@ struct ksmbd_shutdown_request {
struct ksmbd_login_request {
__u32 handle;
__s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */
+ __u32 reserved[16]; /* Reserved room */
};
/*
@@ -135,6 +138,7 @@ struct ksmbd_login_response {
__u16 status;
__u16 hash_sz; /* hash size */
__s8 hash[KSMBD_REQ_MAX_HASH_SZ]; /* password hash */
+ __u32 reserved[16]; /* Reserved room */
};
/*
@@ -143,6 +147,7 @@ struct ksmbd_login_response {
struct ksmbd_share_config_request {
__u32 handle;
__s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; /* share name */
+ __u32 reserved[16]; /* Reserved room */
};
/*
@@ -157,6 +162,7 @@ struct ksmbd_share_config_response {
__u16 force_directory_mode;
__u16 force_uid;
__u16 force_gid;
+ __u32 reserved[128]; /* Reserved room */
__u32 veto_list_sz;
__s8 ____payload[];
};
@@ -187,6 +193,7 @@ struct ksmbd_tree_connect_request {
__s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ];
__s8 share[KSMBD_REQ_MAX_SHARE_NAME];
__s8 peer_addr[64];
+ __u32 reserved[16]; /* Reserved room */
};
/*
@@ -196,6 +203,7 @@ struct ksmbd_tree_connect_response {
__u32 handle;
__u16 status;
__u16 connection_flags;
+ __u32 reserved[16]; /* Reserved room */
};
/*
@@ -204,6 +212,7 @@ struct ksmbd_tree_connect_response {
struct ksmbd_tree_disconnect_request {
__u64 session_id; /* session id */
__u64 connect_id; /* tree connection id */
+ __u32 reserved[16]; /* Reserved room */
};
/*
@@ -212,6 +221,7 @@ struct ksmbd_tree_disconnect_request {
struct ksmbd_logout_request {
__s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */
__u32 account_flags;
+ __u32 reserved[16]; /* Reserved room */
};
/*
@@ -231,7 +241,7 @@ struct ksmbd_rpc_command {
struct ksmbd_spnego_authen_request {
__u32 handle;
__u16 spnego_blob_len; /* the length of spnego_blob */
- __u8 spnego_blob[0]; /*
+ __u8 spnego_blob[]; /*
* the GSS token from SecurityBuffer of
* SMB2 SESSION SETUP request
*/
diff --git a/fs/ksmbd/ksmbd_work.c b/fs/ksmbd/ksmbd_work.c
index fd58eb4809f6..14b9caebf7a4 100644
--- a/fs/ksmbd/ksmbd_work.c
+++ b/fs/ksmbd/ksmbd_work.c
@@ -69,7 +69,6 @@ int ksmbd_workqueue_init(void)
void ksmbd_workqueue_destroy(void)
{
- flush_workqueue(ksmbd_wq);
destroy_workqueue(ksmbd_wq);
ksmbd_wq = NULL;
}
diff --git a/fs/ksmbd/ksmbd_work.h b/fs/ksmbd/ksmbd_work.h
index f7156bc50049..5ece58e40c97 100644
--- a/fs/ksmbd/ksmbd_work.h
+++ b/fs/ksmbd/ksmbd_work.h
@@ -92,7 +92,7 @@ struct ksmbd_work {
*/
static inline void *ksmbd_resp_buf_next(struct ksmbd_work *work)
{
- return work->response_buf + work->next_smb2_rsp_hdr_off;
+ return work->response_buf + work->next_smb2_rsp_hdr_off + 4;
}
/**
@@ -101,7 +101,7 @@ static inline void *ksmbd_resp_buf_next(struct ksmbd_work *work)
*/
static inline void *ksmbd_req_buf_next(struct ksmbd_work *work)
{
- return work->request_buf + work->next_smb2_rcv_hdr_off;
+ return work->request_buf + work->next_smb2_rcv_hdr_off + 4;
}
struct ksmbd_work *ksmbd_alloc_work_struct(void);
diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c
index 1019d3677d55..279d00feff21 100644
--- a/fs/ksmbd/mgmt/user_config.c
+++ b/fs/ksmbd/mgmt/user_config.c
@@ -67,3 +67,13 @@ int ksmbd_anonymous_user(struct ksmbd_user *user)
return 1;
return 0;
}
+
+bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2)
+{
+ if (strcmp(u1->name, u2->name))
+ return false;
+ if (memcmp(u1->passkey, u2->passkey, u1->passkey_sz))
+ return false;
+
+ return true;
+}
diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h
index aff80b029579..6a44109617f1 100644
--- a/fs/ksmbd/mgmt/user_config.h
+++ b/fs/ksmbd/mgmt/user_config.h
@@ -64,4 +64,5 @@ struct ksmbd_user *ksmbd_login_user(const char *account);
struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp);
void ksmbd_free_user(struct ksmbd_user *user);
int ksmbd_anonymous_user(struct ksmbd_user *user);
+bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2);
#endif /* __USER_CONFIG_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h
index 82289c3cbd2b..e241f16a3851 100644
--- a/fs/ksmbd/mgmt/user_session.h
+++ b/fs/ksmbd/mgmt/user_session.h
@@ -45,7 +45,6 @@ struct ksmbd_session {
int state;
__u8 *Preauth_HashValue;
- struct ntlmssp_auth ntlmssp;
char sess_key[CIFS_KEY_SIZE];
struct hlist_node hlist;
diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c
index 60e7ac62c917..1e2076a53bed 100644
--- a/fs/ksmbd/misc.c
+++ b/fs/ksmbd/misc.c
@@ -158,19 +158,41 @@ out:
* Return : windows path string or error
*/
-char *convert_to_nt_pathname(char *filename)
+char *convert_to_nt_pathname(struct ksmbd_share_config *share,
+ struct path *path)
{
- char *ab_pathname;
+ char *pathname, *ab_pathname, *nt_pathname;
+ int share_path_len = share->path_sz;
- if (strlen(filename) == 0)
- filename = "\\";
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathname)
+ return ERR_PTR(-EACCES);
- ab_pathname = kstrdup(filename, GFP_KERNEL);
- if (!ab_pathname)
- return NULL;
+ ab_pathname = d_path(path, pathname, PATH_MAX);
+ if (IS_ERR(ab_pathname)) {
+ nt_pathname = ERR_PTR(-EACCES);
+ goto free_pathname;
+ }
+
+ if (strncmp(ab_pathname, share->path, share_path_len)) {
+ nt_pathname = ERR_PTR(-EACCES);
+ goto free_pathname;
+ }
+
+ nt_pathname = kzalloc(strlen(&ab_pathname[share_path_len]) + 2, GFP_KERNEL);
+ if (!nt_pathname) {
+ nt_pathname = ERR_PTR(-ENOMEM);
+ goto free_pathname;
+ }
+ if (ab_pathname[share_path_len] == '\0')
+ strcpy(nt_pathname, "/");
+ strcat(nt_pathname, &ab_pathname[share_path_len]);
+
+ ksmbd_conv_path_to_windows(nt_pathname);
- ksmbd_conv_path_to_windows(ab_pathname);
- return ab_pathname;
+free_pathname:
+ kfree(pathname);
+ return nt_pathname;
}
int get_nlink(struct kstat *st)
diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h
index 253366bd0951..aae2a252945f 100644
--- a/fs/ksmbd/misc.h
+++ b/fs/ksmbd/misc.h
@@ -14,7 +14,8 @@ struct ksmbd_file;
int match_pattern(const char *str, size_t len, const char *pattern);
int ksmbd_validate_filename(char *filename);
int parse_stream_name(char *filename, char **stream_name, int *s_type);
-char *convert_to_nt_pathname(char *filename);
+char *convert_to_nt_pathname(struct ksmbd_share_config *share,
+ struct path *path);
int get_nlink(struct kstat *st);
void ksmbd_conv_path_to_unix(char *path);
void ksmbd_strip_last_slash(char *path);
diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c
index 8317f7ca402b..5052be9261d9 100644
--- a/fs/ksmbd/ndr.c
+++ b/fs/ksmbd/ndr.c
@@ -148,7 +148,7 @@ static int ndr_read_int16(struct ndr *n, __u16 *value)
static int ndr_read_int32(struct ndr *n, __u32 *value)
{
if (n->offset + sizeof(__u32) > n->length)
- return 0;
+ return -EINVAL;
if (value)
*value = le32_to_cpu(*(__le32 *)ndr_get_field(n));
diff --git a/fs/ksmbd/ntlmssp.h b/fs/ksmbd/ntlmssp.h
index adaf4c0cbe8f..f13153c18b4e 100644
--- a/fs/ksmbd/ntlmssp.h
+++ b/fs/ksmbd/ntlmssp.h
@@ -95,7 +95,7 @@ struct security_buffer {
struct target_info {
__le16 Type;
__le16 Length;
- __u8 Content[0];
+ __u8 Content[];
} __packed;
struct negotiate_message {
@@ -108,7 +108,7 @@ struct negotiate_message {
* struct security_buffer for version info not present since we
* do not set the version is present flag
*/
- char DomainString[0];
+ char DomainString[];
/* followed by WorkstationString */
} __packed;
@@ -140,7 +140,7 @@ struct authenticate_message {
* struct security_buffer for version info not present since we
* do not set the version is present flag
*/
- char UserString[0];
+ char UserString[];
} __packed;
struct ntlmv2_resp {
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index f9dae6ef2115..8b5560574d4c 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -629,10 +629,10 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
return;
}
- rsp_hdr = work->response_buf;
+ rsp_hdr = smb2_get_msg(work->response_buf);
memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
- rsp_hdr->smb2_buf_length =
- cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals));
+ *(__be32 *)work->response_buf =
+ cpu_to_be32(conn->vals->header_size);
rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
rsp_hdr->CreditRequest = cpu_to_le16(0);
@@ -645,7 +645,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
rsp_hdr->SessionId = 0;
memset(rsp_hdr->Signature, 0, 16);
- rsp = work->response_buf;
+ rsp = smb2_get_msg(work->response_buf);
rsp->StructureSize = cpu_to_le16(24);
if (!br_info->open_trunc &&
@@ -656,10 +656,10 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
rsp->OplockLevel = SMB2_OPLOCK_LEVEL_NONE;
rsp->Reserved = 0;
rsp->Reserved2 = 0;
- rsp->PersistentFid = cpu_to_le64(fp->persistent_id);
- rsp->VolatileFid = cpu_to_le64(fp->volatile_id);
+ rsp->PersistentFid = fp->persistent_id;
+ rsp->VolatileFid = fp->volatile_id;
- inc_rfc1001_len(rsp, 24);
+ inc_rfc1001_len(work->response_buf, 24);
ksmbd_debug(OPLOCK,
"sending oplock break v_id %llu p_id = %llu lock level = %d\n",
@@ -736,10 +736,10 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
return;
}
- rsp_hdr = work->response_buf;
+ rsp_hdr = smb2_get_msg(work->response_buf);
memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
- rsp_hdr->smb2_buf_length =
- cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals));
+ *(__be32 *)work->response_buf =
+ cpu_to_be32(conn->vals->header_size);
rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
rsp_hdr->CreditRequest = cpu_to_le16(0);
@@ -752,7 +752,7 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
rsp_hdr->SessionId = 0;
memset(rsp_hdr->Signature, 0, 16);
- rsp = work->response_buf;
+ rsp = smb2_get_msg(work->response_buf);
rsp->StructureSize = cpu_to_le16(44);
rsp->Epoch = br_info->epoch;
rsp->Flags = 0;
@@ -768,7 +768,7 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
rsp->AccessMaskHint = 0;
rsp->ShareMaskHint = 0;
- inc_rfc1001_len(rsp, 44);
+ inc_rfc1001_len(work->response_buf, 44);
ksmbd_conn_write(work);
ksmbd_free_work_struct(work);
@@ -1335,19 +1335,16 @@ __u8 smb2_map_lease_to_oplock(__le32 lease_state)
*/
void create_lease_buf(u8 *rbuf, struct lease *lease)
{
- char *LeaseKey = (char *)&lease->lease_key;
-
if (lease->version == 2) {
struct create_lease_v2 *buf = (struct create_lease_v2 *)rbuf;
- char *ParentLeaseKey = (char *)&lease->parent_lease_key;
memset(buf, 0, sizeof(struct create_lease_v2));
- buf->lcontext.LeaseKeyLow = *((__le64 *)LeaseKey);
- buf->lcontext.LeaseKeyHigh = *((__le64 *)(LeaseKey + 8));
+ memcpy(buf->lcontext.LeaseKey, lease->lease_key,
+ SMB2_LEASE_KEY_SIZE);
buf->lcontext.LeaseFlags = lease->flags;
buf->lcontext.LeaseState = lease->state;
- buf->lcontext.ParentLeaseKeyLow = *((__le64 *)ParentLeaseKey);
- buf->lcontext.ParentLeaseKeyHigh = *((__le64 *)(ParentLeaseKey + 8));
+ memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key,
+ SMB2_LEASE_KEY_SIZE);
buf->ccontext.DataOffset = cpu_to_le16(offsetof
(struct create_lease_v2, lcontext));
buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2));
@@ -1362,8 +1359,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
struct create_lease *buf = (struct create_lease *)rbuf;
memset(buf, 0, sizeof(struct create_lease));
- buf->lcontext.LeaseKeyLow = *((__le64 *)LeaseKey);
- buf->lcontext.LeaseKeyHigh = *((__le64 *)(LeaseKey + 8));
+ memcpy(buf->lcontext.LeaseKey, lease->lease_key, SMB2_LEASE_KEY_SIZE);
buf->lcontext.LeaseFlags = lease->flags;
buf->lcontext.LeaseState = lease->state;
buf->ccontext.DataOffset = cpu_to_le16(offsetof
@@ -1398,7 +1394,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req)
if (!lreq)
return NULL;
- data_offset = (char *)req + 4 + le32_to_cpu(req->CreateContextsOffset);
+ data_offset = (char *)req + le32_to_cpu(req->CreateContextsOffset);
cc = (struct create_context *)data_offset;
do {
cc = (struct create_context *)((char *)cc + next);
@@ -1416,19 +1412,17 @@ struct lease_ctx_info *parse_lease_state(void *open_req)
if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) {
struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
- *((__le64 *)lreq->lease_key) = lc->lcontext.LeaseKeyLow;
- *((__le64 *)(lreq->lease_key + 8)) = lc->lcontext.LeaseKeyHigh;
+ memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
lreq->req_state = lc->lcontext.LeaseState;
lreq->flags = lc->lcontext.LeaseFlags;
lreq->duration = lc->lcontext.LeaseDuration;
- *((__le64 *)lreq->parent_lease_key) = lc->lcontext.ParentLeaseKeyLow;
- *((__le64 *)(lreq->parent_lease_key + 8)) = lc->lcontext.ParentLeaseKeyHigh;
+ memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey,
+ SMB2_LEASE_KEY_SIZE);
lreq->version = 2;
} else {
struct create_lease *lc = (struct create_lease *)cc;
- *((__le64 *)lreq->lease_key) = lc->lcontext.LeaseKeyLow;
- *((__le64 *)(lreq->lease_key + 8)) = lc->lcontext.LeaseKeyHigh;
+ memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
lreq->req_state = lc->lcontext.LeaseState;
lreq->flags = lc->lcontext.LeaseFlags;
lreq->duration = lc->lcontext.LeaseDuration;
@@ -1462,7 +1456,7 @@ struct create_context *smb2_find_context_vals(void *open_req, const char *tag)
* CreateContextsOffset and CreateContextsLength are guaranteed to
* be valid because of ksmbd_smb2_check_message().
*/
- cc = (struct create_context *)((char *)req + 4 +
+ cc = (struct create_context *)((char *)req +
le32_to_cpu(req->CreateContextsOffset));
remain_len = le32_to_cpu(req->CreateContextsLength);
do {
@@ -1700,33 +1694,3 @@ out:
read_unlock(&lease_list_lock);
return ret_op;
}
-
-int smb2_check_durable_oplock(struct ksmbd_file *fp,
- struct lease_ctx_info *lctx, char *name)
-{
- struct oplock_info *opinfo = opinfo_get(fp);
- int ret = 0;
-
- if (opinfo && opinfo->is_lease) {
- if (!lctx) {
- pr_err("open does not include lease\n");
- ret = -EBADF;
- goto out;
- }
- if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key,
- SMB2_LEASE_KEY_SIZE)) {
- pr_err("invalid lease key\n");
- ret = -EBADF;
- goto out;
- }
- if (name && strcmp(fp->filename, name)) {
- pr_err("invalid name reconnect %s\n", name);
- ret = -EINVAL;
- goto out;
- }
- }
-out:
- if (opinfo)
- opinfo_put(opinfo);
- return ret;
-}
diff --git a/fs/ksmbd/oplock.h b/fs/ksmbd/oplock.h
index 119b8047cfbd..09753448f779 100644
--- a/fs/ksmbd/oplock.h
+++ b/fs/ksmbd/oplock.h
@@ -28,8 +28,6 @@
#define OPLOCK_WRITE_TO_NONE 0x04
#define OPLOCK_READ_TO_NONE 0x08
-#define SMB2_LEASE_KEY_SIZE 16
-
struct lease_ctx_info {
__u8 lease_key[SMB2_LEASE_KEY_SIZE];
__le32 req_state;
@@ -126,6 +124,4 @@ struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn,
int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
struct lease_ctx_info *lctx);
void destroy_lease_table(struct ksmbd_conn *conn);
-int smb2_check_durable_oplock(struct ksmbd_file *fp,
- struct lease_ctx_info *lctx, char *name);
#endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index 2a2b2135bfde..4cd03d661df0 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -585,7 +585,7 @@ static int __init ksmbd_server_init(void)
if (ret)
goto err_crypto_destroy;
- pr_warn_once("The ksmbd server is experimental, use at your own risk.\n");
+ pr_warn_once("The ksmbd server is experimental\n");
return 0;
@@ -622,7 +622,6 @@ MODULE_DESCRIPTION("Linux kernel CIFS/SMB SERVER");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: ecb");
MODULE_SOFTDEP("pre: hmac");
-MODULE_SOFTDEP("pre: md4");
MODULE_SOFTDEP("pre: md5");
MODULE_SOFTDEP("pre: nls");
MODULE_SOFTDEP("pre: aes");
@@ -632,5 +631,6 @@ MODULE_SOFTDEP("pre: sha512");
MODULE_SOFTDEP("pre: aead2");
MODULE_SOFTDEP("pre: ccm");
MODULE_SOFTDEP("pre: gcm");
+MODULE_SOFTDEP("pre: crc32");
module_init(ksmbd_server_init)
module_exit(ksmbd_server_exit)
diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
index 030ca57c3784..4a9460153b59 100644
--- a/fs/ksmbd/smb2misc.c
+++ b/fs/ksmbd/smb2misc.c
@@ -6,7 +6,6 @@
#include "glob.h"
#include "nterr.h"
-#include "smb2pdu.h"
#include "smb_common.h"
#include "smbstatus.h"
#include "mgmt/user_session.h"
@@ -290,7 +289,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn,
unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len;
unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge);
void *__hdr = hdr;
- int ret;
+ int ret = 0;
switch (hdr->Command) {
case SMB2_QUERY_INFO:
@@ -327,43 +326,42 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn,
ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n",
credit_charge, calc_credit_num);
return 1;
- } else if (credit_charge > conn->max_credits) {
+ } else if (credit_charge > conn->vals->max_credits) {
ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge);
return 1;
}
spin_lock(&conn->credits_lock);
- if (credit_charge <= conn->total_credits) {
- conn->total_credits -= credit_charge;
- ret = 0;
- } else {
+ if (credit_charge > conn->total_credits) {
ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n",
credit_charge, conn->total_credits);
ret = 1;
}
+
+ if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) {
+ ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n",
+ credit_charge, conn->outstanding_credits);
+ ret = 1;
+ } else
+ conn->outstanding_credits += credit_charge;
+
spin_unlock(&conn->credits_lock);
+
return ret;
}
int ksmbd_smb2_check_message(struct ksmbd_work *work)
{
- struct smb2_pdu *pdu = work->request_buf;
+ struct smb2_pdu *pdu = ksmbd_req_buf_next(work);
struct smb2_hdr *hdr = &pdu->hdr;
int command;
__u32 clc_len; /* calculated length */
- __u32 len = get_rfc1002_len(pdu);
-
- if (work->next_smb2_rcv_hdr_off) {
- pdu = ksmbd_req_buf_next(work);
- hdr = &pdu->hdr;
- }
+ __u32 len = get_rfc1002_len(work->request_buf);
- if (le32_to_cpu(hdr->NextCommand) > 0) {
+ if (le32_to_cpu(hdr->NextCommand) > 0)
len = le32_to_cpu(hdr->NextCommand);
- } else if (work->next_smb2_rcv_hdr_off) {
+ else if (work->next_smb2_rcv_hdr_off)
len -= work->next_smb2_rcv_hdr_off;
- len = round_up(len, 8);
- }
if (check_smb2_hdr(hdr))
return 1;
diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c
index fb6a65d23139..ab23da2120b9 100644
--- a/fs/ksmbd/smb2ops.c
+++ b/fs/ksmbd/smb2ops.c
@@ -6,7 +6,6 @@
#include <linux/slab.h>
#include "glob.h"
-#include "smb2pdu.h"
#include "auth.h"
#include "connection.h"
@@ -20,6 +19,7 @@ static struct smb_version_values smb21_server_values = {
.max_read_size = SMB21_DEFAULT_IOSIZE,
.max_write_size = SMB21_DEFAULT_IOSIZE,
.max_trans_size = SMB21_DEFAULT_IOSIZE,
+ .max_credits = SMB2_MAX_CREDITS,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
.shared_lock_type = SMB2_LOCKFLAG_SHARED,
@@ -45,6 +45,7 @@ static struct smb_version_values smb30_server_values = {
.max_read_size = SMB3_DEFAULT_IOSIZE,
.max_write_size = SMB3_DEFAULT_IOSIZE,
.max_trans_size = SMB3_DEFAULT_TRANS_SIZE,
+ .max_credits = SMB2_MAX_CREDITS,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
.shared_lock_type = SMB2_LOCKFLAG_SHARED,
@@ -71,6 +72,7 @@ static struct smb_version_values smb302_server_values = {
.max_read_size = SMB3_DEFAULT_IOSIZE,
.max_write_size = SMB3_DEFAULT_IOSIZE,
.max_trans_size = SMB3_DEFAULT_TRANS_SIZE,
+ .max_credits = SMB2_MAX_CREDITS,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
.shared_lock_type = SMB2_LOCKFLAG_SHARED,
@@ -97,6 +99,7 @@ static struct smb_version_values smb311_server_values = {
.max_read_size = SMB3_DEFAULT_IOSIZE,
.max_write_size = SMB3_DEFAULT_IOSIZE,
.max_trans_size = SMB3_DEFAULT_TRANS_SIZE,
+ .max_credits = SMB2_MAX_CREDITS,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
.shared_lock_type = SMB2_LOCKFLAG_SHARED,
@@ -198,8 +201,7 @@ void init_smb2_1_server(struct ksmbd_conn *conn)
conn->ops = &smb2_0_server_ops;
conn->cmds = smb2_0_server_cmds;
conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds);
- conn->max_credits = SMB2_MAX_CREDITS;
- conn->signing_algorithm = SIGNING_ALG_HMAC_SHA256;
+ conn->signing_algorithm = SIGNING_ALG_HMAC_SHA256_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
@@ -216,8 +218,7 @@ void init_smb3_0_server(struct ksmbd_conn *conn)
conn->ops = &smb3_0_server_ops;
conn->cmds = smb2_0_server_cmds;
conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds);
- conn->max_credits = SMB2_MAX_CREDITS;
- conn->signing_algorithm = SIGNING_ALG_AES_CMAC;
+ conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
@@ -241,8 +242,7 @@ void init_smb3_02_server(struct ksmbd_conn *conn)
conn->ops = &smb3_0_server_ops;
conn->cmds = smb2_0_server_cmds;
conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds);
- conn->max_credits = SMB2_MAX_CREDITS;
- conn->signing_algorithm = SIGNING_ALG_AES_CMAC;
+ conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
@@ -266,15 +266,11 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
conn->ops = &smb3_11_server_ops;
conn->cmds = smb2_0_server_cmds;
conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds);
- conn->max_credits = SMB2_MAX_CREDITS;
- conn->signing_algorithm = SIGNING_ALG_AES_CMAC;
+ conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
- if (conn->cipher_type)
- conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
-
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
@@ -308,3 +304,11 @@ void init_smb2_max_trans_size(unsigned int sz)
smb302_server_values.max_trans_size = sz;
smb311_server_values.max_trans_size = sz;
}
+
+void init_smb2_max_credits(unsigned int sz)
+{
+ smb21_server_values.max_credits = sz;
+ smb30_server_values.max_credits = sz;
+ smb302_server_values.max_credits = sz;
+ smb311_server_values.max_credits = sz;
+}
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 7e448df3f847..16c803a9d996 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -11,9 +11,9 @@
#include <linux/statfs.h>
#include <linux/ethtool.h>
#include <linux/falloc.h>
+#include <linux/mount.h>
#include "glob.h"
-#include "smb2pdu.h"
#include "smbfsctl.h"
#include "oplock.h"
#include "smbacl.h"
@@ -44,8 +44,8 @@ static void __wbuf(struct ksmbd_work *work, void **req, void **rsp)
*req = ksmbd_req_buf_next(work);
*rsp = ksmbd_resp_buf_next(work);
} else {
- *req = work->request_buf;
- *rsp = work->response_buf;
+ *req = smb2_get_msg(work->request_buf);
+ *rsp = smb2_get_msg(work->response_buf);
}
}
@@ -93,13 +93,14 @@ struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn
*/
int smb2_get_ksmbd_tcon(struct ksmbd_work *work)
{
- struct smb2_hdr *req_hdr = work->request_buf;
+ struct smb2_hdr *req_hdr = smb2_get_msg(work->request_buf);
+ unsigned int cmd = le16_to_cpu(req_hdr->Command);
int tree_id;
work->tcon = NULL;
- if (work->conn->ops->get_cmd_val(work) == SMB2_TREE_CONNECT_HE ||
- work->conn->ops->get_cmd_val(work) == SMB2_CANCEL_HE ||
- work->conn->ops->get_cmd_val(work) == SMB2_LOGOFF_HE) {
+ if (cmd == SMB2_TREE_CONNECT_HE ||
+ cmd == SMB2_CANCEL_HE ||
+ cmd == SMB2_LOGOFF_HE) {
ksmbd_debug(SMB, "skip to check tree connect request\n");
return 0;
}
@@ -130,7 +131,7 @@ void smb2_set_err_rsp(struct ksmbd_work *work)
if (work->next_smb2_rcv_hdr_off)
err_rsp = ksmbd_resp_buf_next(work);
else
- err_rsp = work->response_buf;
+ err_rsp = smb2_get_msg(work->response_buf);
if (err_rsp->hdr.Status != STATUS_STOPPED_ON_SYMLINK) {
err_rsp->StructureSize = SMB2_ERROR_STRUCTURE_SIZE2_LE;
@@ -150,7 +151,7 @@ void smb2_set_err_rsp(struct ksmbd_work *work)
*/
bool is_smb2_neg_cmd(struct ksmbd_work *work)
{
- struct smb2_hdr *hdr = work->request_buf;
+ struct smb2_hdr *hdr = smb2_get_msg(work->request_buf);
/* is it SMB2 header ? */
if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
@@ -174,7 +175,7 @@ bool is_smb2_neg_cmd(struct ksmbd_work *work)
*/
bool is_smb2_rsp(struct ksmbd_work *work)
{
- struct smb2_hdr *hdr = work->response_buf;
+ struct smb2_hdr *hdr = smb2_get_msg(work->response_buf);
/* is it SMB2 header ? */
if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
@@ -200,7 +201,7 @@ u16 get_smb2_cmd_val(struct ksmbd_work *work)
if (work->next_smb2_rcv_hdr_off)
rcv_hdr = ksmbd_req_buf_next(work);
else
- rcv_hdr = work->request_buf;
+ rcv_hdr = smb2_get_msg(work->request_buf);
return le16_to_cpu(rcv_hdr->Command);
}
@@ -216,7 +217,7 @@ void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err)
if (work->next_smb2_rcv_hdr_off)
rsp_hdr = ksmbd_resp_buf_next(work);
else
- rsp_hdr = work->response_buf;
+ rsp_hdr = smb2_get_msg(work->response_buf);
rsp_hdr->Status = err;
smb2_set_err_rsp(work);
}
@@ -237,13 +238,11 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
if (conn->need_neg == false)
return -EINVAL;
- rsp_hdr = work->response_buf;
+ *(__be32 *)work->response_buf =
+ cpu_to_be32(conn->vals->header_size);
+ rsp_hdr = smb2_get_msg(work->response_buf);
memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
-
- rsp_hdr->smb2_buf_length =
- cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals));
-
rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
rsp_hdr->CreditRequest = cpu_to_le16(2);
@@ -256,7 +255,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
rsp_hdr->SessionId = 0;
memset(rsp_hdr->Signature, 0, 16);
- rsp = work->response_buf;
+ rsp = smb2_get_msg(work->response_buf);
WARN_ON(ksmbd_conn_good(work));
@@ -277,12 +276,12 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
rsp->SecurityBufferOffset = cpu_to_le16(128);
rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH);
- ksmbd_copy_gss_neg_header(((char *)(&rsp->hdr) +
- sizeof(rsp->hdr.smb2_buf_length)) +
+ ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) +
le16_to_cpu(rsp->SecurityBufferOffset));
- inc_rfc1001_len(rsp, sizeof(struct smb2_negotiate_rsp) -
- sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) +
- AUTH_GSS_LENGTH);
+ inc_rfc1001_len(work->response_buf,
+ sizeof(struct smb2_negotiate_rsp) -
+ sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) +
+ AUTH_GSS_LENGTH);
rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE;
if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY)
rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE;
@@ -301,16 +300,15 @@ int smb2_set_rsp_credits(struct ksmbd_work *work)
struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work);
struct smb2_hdr *hdr = ksmbd_resp_buf_next(work);
struct ksmbd_conn *conn = work->conn;
- unsigned short credits_requested;
+ unsigned short credits_requested, aux_max;
unsigned short credit_charge, credits_granted = 0;
- unsigned short aux_max, aux_credits;
if (work->send_no_response)
return 0;
hdr->CreditCharge = req_hdr->CreditCharge;
- if (conn->total_credits > conn->max_credits) {
+ if (conn->total_credits > conn->vals->max_credits) {
hdr->CreditRequest = 0;
pr_err("Total credits overflow: %d\n", conn->total_credits);
return -EINVAL;
@@ -318,6 +316,14 @@ int smb2_set_rsp_credits(struct ksmbd_work *work)
credit_charge = max_t(unsigned short,
le16_to_cpu(req_hdr->CreditCharge), 1);
+ if (credit_charge > conn->total_credits) {
+ ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n",
+ credit_charge, conn->total_credits);
+ return -EINVAL;
+ }
+
+ conn->total_credits -= credit_charge;
+ conn->outstanding_credits -= credit_charge;
credits_requested = max_t(unsigned short,
le16_to_cpu(req_hdr->CreditRequest), 1);
@@ -327,16 +333,14 @@ int smb2_set_rsp_credits(struct ksmbd_work *work)
* TODO: Need to adjuct CreditRequest value according to
* current cpu load
*/
- aux_credits = credits_requested - 1;
if (hdr->Command == SMB2_NEGOTIATE)
- aux_max = 0;
+ aux_max = 1;
else
- aux_max = conn->max_credits - credit_charge;
- aux_credits = min_t(unsigned short, aux_credits, aux_max);
- credits_granted = credit_charge + aux_credits;
+ aux_max = conn->vals->max_credits - credit_charge;
+ credits_granted = min_t(unsigned short, credits_requested, aux_max);
- if (conn->max_credits - conn->total_credits < credits_granted)
- credits_granted = conn->max_credits -
+ if (conn->vals->max_credits - conn->total_credits < credits_granted)
+ credits_granted = conn->vals->max_credits -
conn->total_credits;
conn->total_credits += credits_granted;
@@ -374,12 +378,8 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work)
* command in the compound request
*/
if (req->Command == SMB2_CREATE && rsp->Status == STATUS_SUCCESS) {
- work->compound_fid =
- le64_to_cpu(((struct smb2_create_rsp *)rsp)->
- VolatileFileId);
- work->compound_pfid =
- le64_to_cpu(((struct smb2_create_rsp *)rsp)->
- PersistentFileId);
+ work->compound_fid = ((struct smb2_create_rsp *)rsp)->VolatileFileId;
+ work->compound_pfid = ((struct smb2_create_rsp *)rsp)->PersistentFileId;
work->compound_sid = le64_to_cpu(rsp->SessionId);
}
@@ -387,8 +387,8 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work)
next_hdr_offset = le32_to_cpu(req->NextCommand);
new_len = ALIGN(len, 8);
- inc_rfc1001_len(work->response_buf, ((sizeof(struct smb2_hdr) - 4)
- + new_len - len));
+ inc_rfc1001_len(work->response_buf,
+ sizeof(struct smb2_hdr) + new_len - len);
rsp->NextCommand = cpu_to_le32(new_len);
work->next_smb2_rcv_hdr_off += next_hdr_offset;
@@ -406,7 +406,7 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work)
work->compound_fid = KSMBD_NO_FID;
work->compound_pfid = KSMBD_NO_FID;
}
- memset((char *)rsp_hdr + 4, 0, sizeof(struct smb2_hdr) + 2);
+ memset((char *)rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
rsp_hdr->Command = rcv_hdr->Command;
@@ -432,7 +432,7 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work)
*/
bool is_chained_smb2_message(struct ksmbd_work *work)
{
- struct smb2_hdr *hdr = work->request_buf;
+ struct smb2_hdr *hdr = smb2_get_msg(work->request_buf);
unsigned int len, next_cmd;
if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
@@ -483,13 +483,13 @@ bool is_chained_smb2_message(struct ksmbd_work *work)
*/
int init_smb2_rsp_hdr(struct ksmbd_work *work)
{
- struct smb2_hdr *rsp_hdr = work->response_buf;
- struct smb2_hdr *rcv_hdr = work->request_buf;
+ struct smb2_hdr *rsp_hdr = smb2_get_msg(work->response_buf);
+ struct smb2_hdr *rcv_hdr = smb2_get_msg(work->request_buf);
struct ksmbd_conn *conn = work->conn;
memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
- rsp_hdr->smb2_buf_length =
- cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals));
+ *(__be32 *)work->response_buf =
+ cpu_to_be32(conn->vals->header_size);
rsp_hdr->ProtocolId = rcv_hdr->ProtocolId;
rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
rsp_hdr->Command = rcv_hdr->Command;
@@ -522,7 +522,7 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work)
*/
int smb2_allocate_rsp_buf(struct ksmbd_work *work)
{
- struct smb2_hdr *hdr = work->request_buf;
+ struct smb2_hdr *hdr = smb2_get_msg(work->request_buf);
size_t small_sz = MAX_CIFS_SMALL_BUFFER_SIZE;
size_t large_sz = small_sz + work->conn->vals->max_trans_size;
size_t sz = small_sz;
@@ -534,7 +534,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work)
if (cmd == SMB2_QUERY_INFO_HE) {
struct smb2_query_info_req *req;
- req = work->request_buf;
+ req = smb2_get_msg(work->request_buf);
if (req->InfoType == SMB2_O_INFO_FILE &&
(req->FileInfoClass == FILE_FULL_EA_INFORMATION ||
req->FileInfoClass == FILE_ALL_INFORMATION))
@@ -561,7 +561,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work)
*/
int smb2_check_user_session(struct ksmbd_work *work)
{
- struct smb2_hdr *req_hdr = work->request_buf;
+ struct smb2_hdr *req_hdr = smb2_get_msg(work->request_buf);
struct ksmbd_conn *conn = work->conn;
unsigned int cmd = conn->ops->get_cmd_val(work);
unsigned long long sess_id;
@@ -612,16 +612,14 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id)
/**
* smb2_get_name() - get filename string from on the wire smb format
- * @share: ksmbd_share_config pointer
* @src: source buffer
* @maxlen: maxlen of source string
- * @nls_table: nls_table pointer
+ * @local_nls: nls_table pointer
*
* Return: matching converted filename on success, otherwise error ptr
*/
static char *
-smb2_get_name(struct ksmbd_share_config *share, const char *src,
- const int maxlen, struct nls_table *local_nls)
+smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls)
{
char *name;
@@ -642,7 +640,7 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg)
struct ksmbd_conn *conn = work->conn;
int id;
- rsp_hdr = work->response_buf;
+ rsp_hdr = smb2_get_msg(work->response_buf);
rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND;
id = ksmbd_acquire_async_msg_id(&conn->async_ida);
@@ -674,7 +672,7 @@ void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status)
{
struct smb2_hdr *rsp_hdr;
- rsp_hdr = work->response_buf;
+ rsp_hdr = smb2_get_msg(work->response_buf);
smb2_set_err_rsp(work);
rsp_hdr->Status = status;
@@ -715,17 +713,17 @@ static int smb2_get_dos_mode(struct kstat *stat, int attribute)
int attr = 0;
if (S_ISDIR(stat->mode)) {
- attr = ATTR_DIRECTORY |
- (attribute & (ATTR_HIDDEN | ATTR_SYSTEM));
+ attr = FILE_ATTRIBUTE_DIRECTORY |
+ (attribute & (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_SYSTEM));
} else {
- attr = (attribute & 0x00005137) | ATTR_ARCHIVE;
- attr &= ~(ATTR_DIRECTORY);
+ attr = (attribute & 0x00005137) | FILE_ATTRIBUTE_ARCHIVE;
+ attr &= ~(FILE_ATTRIBUTE_DIRECTORY);
if (S_ISREG(stat->mode) && (server_conf.share_fake_fscaps &
FILE_SUPPORTS_SPARSE_FILES))
- attr |= ATTR_SPARSE;
+ attr |= FILE_ATTRIBUTE_SPARSE_FILE;
if (smb2_get_reparse_tag_special_file(stat->mode))
- attr |= ATTR_REPARSE;
+ attr |= FILE_ATTRIBUTE_REPARSE_POINT;
}
return attr;
@@ -753,16 +751,16 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt,
pneg_ctxt->Ciphers[0] = cipher_type;
}
-static void build_compression_ctxt(struct smb2_compression_ctx *pneg_ctxt,
+static void build_compression_ctxt(struct smb2_compression_capabilities_context *pneg_ctxt,
__le16 comp_algo)
{
pneg_ctxt->ContextType = SMB2_COMPRESSION_CAPABILITIES;
pneg_ctxt->DataLength =
- cpu_to_le16(sizeof(struct smb2_compression_ctx)
+ cpu_to_le16(sizeof(struct smb2_compression_capabilities_context)
- sizeof(struct smb2_neg_context));
pneg_ctxt->Reserved = cpu_to_le32(0);
pneg_ctxt->CompressionAlgorithmCount = cpu_to_le16(1);
- pneg_ctxt->Reserved1 = cpu_to_le32(0);
+ pneg_ctxt->Flags = cpu_to_le32(0);
pneg_ctxt->CompressionAlgorithms[0] = comp_algo;
}
@@ -802,11 +800,11 @@ static void build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
}
static void assemble_neg_contexts(struct ksmbd_conn *conn,
- struct smb2_negotiate_rsp *rsp)
+ struct smb2_negotiate_rsp *rsp,
+ void *smb2_buf_len)
{
- /* +4 is to account for the RFC1001 len field */
char *pneg_ctxt = (char *)rsp +
- le32_to_cpu(rsp->NegotiateContextOffset) + 4;
+ le32_to_cpu(rsp->NegotiateContextOffset);
int neg_ctxt_cnt = 1;
int ctxt_size;
@@ -815,7 +813,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn,
build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt,
conn->preauth_info->Preauth_HashId);
rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt);
- inc_rfc1001_len(rsp, AUTH_GSS_PADDING);
+ inc_rfc1001_len(smb2_buf_len, AUTH_GSS_PADDING);
ctxt_size = sizeof(struct smb2_preauth_neg_context);
/* Round to 8 byte boundary */
pneg_ctxt += round_up(sizeof(struct smb2_preauth_neg_context), 8);
@@ -839,12 +837,12 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn,
ksmbd_debug(SMB,
"assemble SMB2_COMPRESSION_CAPABILITIES context\n");
/* Temporarily set to SMB3_COMPRESS_NONE */
- build_compression_ctxt((struct smb2_compression_ctx *)pneg_ctxt,
+ build_compression_ctxt((struct smb2_compression_capabilities_context *)pneg_ctxt,
conn->compress_algorithm);
rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
- ctxt_size += sizeof(struct smb2_compression_ctx) + 2;
+ ctxt_size += sizeof(struct smb2_compression_capabilities_context) + 2;
/* Round to 8 byte boundary */
- pneg_ctxt += round_up(sizeof(struct smb2_compression_ctx) + 2,
+ pneg_ctxt += round_up(sizeof(struct smb2_compression_capabilities_context) + 2,
8);
}
@@ -869,7 +867,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn,
ctxt_size += sizeof(struct smb2_signing_capabilities) + 2;
}
- inc_rfc1001_len(rsp, ctxt_size);
+ inc_rfc1001_len(smb2_buf_len, ctxt_size);
}
static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn,
@@ -917,8 +915,27 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn,
}
}
+/**
+ * smb3_encryption_negotiated() - checks if server and client agreed on enabling encryption
+ * @conn: smb connection
+ *
+ * Return: true if connection should be encrypted, else false
+ */
+static bool smb3_encryption_negotiated(struct ksmbd_conn *conn)
+{
+ if (!conn->ops->generate_encryptionkey)
+ return false;
+
+ /*
+ * SMB 3.0 and 3.0.2 dialects use the SMB2_GLOBAL_CAP_ENCRYPTION flag.
+ * SMB 3.1.1 uses the cipher_type field.
+ */
+ return (conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) ||
+ conn->cipher_type;
+}
+
static void decode_compress_ctxt(struct ksmbd_conn *conn,
- struct smb2_compression_ctx *pneg_ctxt)
+ struct smb2_compression_capabilities_context *pneg_ctxt)
{
conn->compress_algorithm = SMB3_COMPRESS_NONE;
}
@@ -939,8 +956,8 @@ static void decode_sign_cap_ctxt(struct ksmbd_conn *conn,
}
for (i = 0; i < sign_algo_cnt; i++) {
- if (pneg_ctxt->SigningAlgorithms[i] == SIGNING_ALG_HMAC_SHA256 ||
- pneg_ctxt->SigningAlgorithms[i] == SIGNING_ALG_AES_CMAC) {
+ if (pneg_ctxt->SigningAlgorithms[i] == SIGNING_ALG_HMAC_SHA256_LE ||
+ pneg_ctxt->SigningAlgorithms[i] == SIGNING_ALG_AES_CMAC_LE) {
ksmbd_debug(SMB, "Signing Algorithm ID = 0x%x\n",
pneg_ctxt->SigningAlgorithms[i]);
conn->signing_negotiated = true;
@@ -952,14 +969,14 @@ static void decode_sign_cap_ctxt(struct ksmbd_conn *conn,
}
static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn,
- struct smb2_negotiate_req *req)
+ struct smb2_negotiate_req *req,
+ int len_of_smb)
{
/* +4 is to account for the RFC1001 len field */
- struct smb2_neg_context *pctx = (struct smb2_neg_context *)((char *)req + 4);
+ struct smb2_neg_context *pctx = (struct smb2_neg_context *)req;
int i = 0, len_of_ctxts;
int offset = le32_to_cpu(req->NegotiateContextOffset);
int neg_ctxt_cnt = le16_to_cpu(req->NegotiateContextCount);
- int len_of_smb = be32_to_cpu(req->hdr.smb2_buf_length);
__le32 status = STATUS_INVALID_PARAMETER;
ksmbd_debug(SMB, "decoding %d negotiate contexts\n", neg_ctxt_cnt);
@@ -1011,7 +1028,7 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn,
break;
decode_compress_ctxt(conn,
- (struct smb2_compression_ctx *)pctx);
+ (struct smb2_compression_capabilities_context *)pctx);
} else if (pctx->ContextType == SMB2_NETNAME_NEGOTIATE_CONTEXT_ID) {
ksmbd_debug(SMB,
"deassemble SMB2_NETNAME_NEGOTIATE_CONTEXT_ID context\n");
@@ -1044,8 +1061,8 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn,
int smb2_handle_negotiate(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb2_negotiate_req *req = work->request_buf;
- struct smb2_negotiate_rsp *rsp = work->response_buf;
+ struct smb2_negotiate_req *req = smb2_get_msg(work->request_buf);
+ struct smb2_negotiate_rsp *rsp = smb2_get_msg(work->response_buf);
int rc = 0;
unsigned int smb2_buf_len, smb2_neg_size;
__le32 status;
@@ -1066,7 +1083,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
}
smb2_buf_len = get_rfc1002_len(work->request_buf);
- smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects) - 4;
+ smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects);
if (smb2_neg_size > smb2_buf_len) {
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
rc = -EINVAL;
@@ -1115,7 +1132,8 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
goto err_out;
}
- status = deassemble_neg_contexts(conn, req);
+ status = deassemble_neg_contexts(conn, req,
+ get_rfc1002_len(work->request_buf));
if (status != STATUS_SUCCESS) {
pr_err("deassemble_neg_contexts error(0x%x)\n",
status);
@@ -1135,7 +1153,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
conn->preauth_info->Preauth_HashValue);
rsp->NegotiateContextOffset =
cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
- assemble_neg_contexts(conn, rsp);
+ assemble_neg_contexts(conn, rsp, work->response_buf);
break;
case SMB302_PROT_ID:
init_smb3_02_server(conn);
@@ -1183,10 +1201,9 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
rsp->SecurityBufferOffset = cpu_to_le16(128);
rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH);
- ksmbd_copy_gss_neg_header(((char *)(&rsp->hdr) +
- sizeof(rsp->hdr.smb2_buf_length)) +
- le16_to_cpu(rsp->SecurityBufferOffset));
- inc_rfc1001_len(rsp, sizeof(struct smb2_negotiate_rsp) -
+ ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) +
+ le16_to_cpu(rsp->SecurityBufferOffset));
+ inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) -
sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) +
AUTH_GSS_LENGTH);
rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE;
@@ -1278,7 +1295,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
struct negotiate_message *negblob,
size_t negblob_len)
{
- struct smb2_sess_setup_rsp *rsp = work->response_buf;
+ struct smb2_sess_setup_rsp *rsp = smb2_get_msg(work->response_buf);
struct challenge_message *chgblob;
unsigned char *spnego_blob = NULL;
u16 spnego_blob_len;
@@ -1286,7 +1303,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
int sz, rc;
ksmbd_debug(SMB, "negotiate phase\n");
- rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess);
+ rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->conn);
if (rc)
return rc;
@@ -1296,7 +1313,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
memset(chgblob, 0, sizeof(struct challenge_message));
if (!work->conn->use_spnego) {
- sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess);
+ sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn);
if (sz < 0)
return -ENOMEM;
@@ -1312,7 +1329,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
return -ENOMEM;
chgblob = (struct challenge_message *)neg_blob;
- sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess);
+ sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn);
if (sz < 0) {
rc = -ENOMEM;
goto out;
@@ -1386,8 +1403,8 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn,
static int ntlm_authenticate(struct ksmbd_work *work)
{
- struct smb2_sess_setup_req *req = work->request_buf;
- struct smb2_sess_setup_rsp *rsp = work->response_buf;
+ struct smb2_sess_setup_req *req = smb2_get_msg(work->request_buf);
+ struct smb2_sess_setup_rsp *rsp = smb2_get_msg(work->response_buf);
struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess = work->sess;
struct channel *chann = NULL;
@@ -1410,7 +1427,7 @@ static int ntlm_authenticate(struct ksmbd_work *work)
memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
kfree(spnego_blob);
- inc_rfc1001_len(rsp, spnego_blob_len - 1);
+ inc_rfc1001_len(work->response_buf, spnego_blob_len - 1);
}
user = session_user(conn, req);
@@ -1433,61 +1450,62 @@ static int ntlm_authenticate(struct ksmbd_work *work)
ksmbd_free_user(user);
return 0;
}
- ksmbd_free_user(sess->user);
- }
- sess->user = user;
- if (user_guest(sess->user)) {
- if (conn->sign) {
- ksmbd_debug(SMB, "Guest login not allowed when signing enabled\n");
+ if (!ksmbd_compare_user(sess->user, user)) {
+ ksmbd_free_user(user);
return -EPERM;
}
+ ksmbd_free_user(user);
+ } else {
+ sess->user = user;
+ }
+ if (user_guest(sess->user)) {
rsp->SessionFlags = SMB2_SESSION_FLAG_IS_GUEST_LE;
} else {
struct authenticate_message *authblob;
authblob = user_authblob(conn, req);
sz = le16_to_cpu(req->SecurityBufferLength);
- rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, sess);
+ rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess);
if (rc) {
set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD);
ksmbd_debug(SMB, "authentication failed\n");
return -EPERM;
}
+ }
- /*
- * If session state is SMB2_SESSION_VALID, We can assume
- * that it is reauthentication. And the user/password
- * has been verified, so return it here.
- */
- if (sess->state == SMB2_SESSION_VALID) {
- if (conn->binding)
- goto binding_session;
- return 0;
- }
+ /*
+ * If session state is SMB2_SESSION_VALID, We can assume
+ * that it is reauthentication. And the user/password
+ * has been verified, so return it here.
+ */
+ if (sess->state == SMB2_SESSION_VALID) {
+ if (conn->binding)
+ goto binding_session;
+ return 0;
+ }
- if ((conn->sign || server_conf.enforced_signing) ||
- (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED))
- sess->sign = true;
+ if ((rsp->SessionFlags != SMB2_SESSION_FLAG_IS_GUEST_LE &&
+ (conn->sign || server_conf.enforced_signing)) ||
+ (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED))
+ sess->sign = true;
- if (conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION &&
- conn->ops->generate_encryptionkey &&
- !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
- rc = conn->ops->generate_encryptionkey(sess);
- if (rc) {
- ksmbd_debug(SMB,
- "SMB3 encryption key generation failed\n");
- return -EINVAL;
- }
- sess->enc = true;
- rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
- /*
- * signing is disable if encryption is enable
- * on this session
- */
- sess->sign = false;
+ if (smb3_encryption_negotiated(conn) &&
+ !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
+ rc = conn->ops->generate_encryptionkey(sess);
+ if (rc) {
+ ksmbd_debug(SMB,
+ "SMB3 encryption key generation failed\n");
+ return -EINVAL;
}
+ sess->enc = true;
+ rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
+ /*
+ * signing is disable if encryption is enable
+ * on this session
+ */
+ sess->sign = false;
}
binding_session:
@@ -1522,8 +1540,8 @@ binding_session:
#ifdef CONFIG_SMB_SERVER_KERBEROS5
static int krb5_authenticate(struct ksmbd_work *work)
{
- struct smb2_sess_setup_req *req = work->request_buf;
- struct smb2_sess_setup_rsp *rsp = work->response_buf;
+ struct smb2_sess_setup_req *req = smb2_get_msg(work->request_buf);
+ struct smb2_sess_setup_rsp *rsp = smb2_get_msg(work->response_buf);
struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess = work->sess;
char *in_blob, *out_blob;
@@ -1538,8 +1556,7 @@ static int krb5_authenticate(struct ksmbd_work *work)
out_blob = (char *)&rsp->hdr.ProtocolId +
le16_to_cpu(rsp->SecurityBufferOffset);
out_len = work->response_sz -
- offsetof(struct smb2_hdr, smb2_buf_length) -
- le16_to_cpu(rsp->SecurityBufferOffset);
+ (le16_to_cpu(rsp->SecurityBufferOffset) + 4);
/* Check previous session */
prev_sess_id = le64_to_cpu(req->PreviousSessionId);
@@ -1556,14 +1573,13 @@ static int krb5_authenticate(struct ksmbd_work *work)
return -EINVAL;
}
rsp->SecurityBufferLength = cpu_to_le16(out_len);
- inc_rfc1001_len(rsp, out_len - 1);
+ inc_rfc1001_len(work->response_buf, out_len - 1);
if ((conn->sign || server_conf.enforced_signing) ||
(req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED))
sess->sign = true;
- if ((conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) &&
- conn->ops->generate_encryptionkey) {
+ if (smb3_encryption_negotiated(conn)) {
retval = conn->ops->generate_encryptionkey(sess);
if (retval) {
ksmbd_debug(SMB,
@@ -1612,8 +1628,8 @@ static int krb5_authenticate(struct ksmbd_work *work)
int smb2_sess_setup(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb2_sess_setup_req *req = work->request_buf;
- struct smb2_sess_setup_rsp *rsp = work->response_buf;
+ struct smb2_sess_setup_req *req = smb2_get_msg(work->request_buf);
+ struct smb2_sess_setup_rsp *rsp = smb2_get_msg(work->response_buf);
struct ksmbd_session *sess;
struct negotiate_message *negblob;
unsigned int negblob_len, negblob_off;
@@ -1625,7 +1641,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
rsp->SessionFlags = 0;
rsp->SecurityBufferOffset = cpu_to_le16(72);
rsp->SecurityBufferLength = 0;
- inc_rfc1001_len(rsp, 9);
+ inc_rfc1001_len(work->response_buf, 9);
if (!req->hdr.SessionId) {
sess = ksmbd_smb2_session_create();
@@ -1699,9 +1715,11 @@ int smb2_sess_setup(struct ksmbd_work *work)
negblob_off = le16_to_cpu(req->SecurityBufferOffset);
negblob_len = le16_to_cpu(req->SecurityBufferLength);
- if (negblob_off < (offsetof(struct smb2_sess_setup_req, Buffer) - 4) ||
- negblob_len < offsetof(struct negotiate_message, NegotiateFlags))
- return -EINVAL;
+ if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) ||
+ negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) {
+ rc = -EINVAL;
+ goto out_err;
+ }
negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId +
negblob_off);
@@ -1739,7 +1757,8 @@ int smb2_sess_setup(struct ksmbd_work *work)
* Note: here total size -1 is done as an
* adjustment for 0 size blob
*/
- inc_rfc1001_len(rsp, le16_to_cpu(rsp->SecurityBufferLength) - 1);
+ inc_rfc1001_len(work->response_buf,
+ le16_to_cpu(rsp->SecurityBufferLength) - 1);
} else if (negblob->MessageType == NtLmAuthenticate) {
rc = ntlm_authenticate(work);
@@ -1828,8 +1847,8 @@ out_err:
int smb2_tree_connect(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb2_tree_connect_req *req = work->request_buf;
- struct smb2_tree_connect_rsp *rsp = work->response_buf;
+ struct smb2_tree_connect_req *req = smb2_get_msg(work->request_buf);
+ struct smb2_tree_connect_rsp *rsp = smb2_get_msg(work->response_buf);
struct ksmbd_session *sess = work->sess;
char *treename = NULL, *name = NULL;
struct ksmbd_tree_conn_status status;
@@ -1894,7 +1913,7 @@ out_err1:
rsp->Reserved = 0;
/* default manual caching */
rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING;
- inc_rfc1001_len(rsp, 16);
+ inc_rfc1001_len(work->response_buf, 16);
if (!IS_ERR(treename))
kfree(treename);
@@ -1999,17 +2018,18 @@ static int smb2_create_open_flags(bool file_present, __le32 access,
*/
int smb2_tree_disconnect(struct ksmbd_work *work)
{
- struct smb2_tree_disconnect_rsp *rsp = work->response_buf;
+ struct smb2_tree_disconnect_rsp *rsp = smb2_get_msg(work->response_buf);
struct ksmbd_session *sess = work->sess;
struct ksmbd_tree_connect *tcon = work->tcon;
rsp->StructureSize = cpu_to_le16(4);
- inc_rfc1001_len(rsp, 4);
+ inc_rfc1001_len(work->response_buf, 4);
ksmbd_debug(SMB, "request\n");
if (!tcon) {
- struct smb2_tree_disconnect_req *req = work->request_buf;
+ struct smb2_tree_disconnect_req *req =
+ smb2_get_msg(work->request_buf);
ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId);
rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
@@ -2031,24 +2051,21 @@ int smb2_tree_disconnect(struct ksmbd_work *work)
int smb2_session_logoff(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb2_logoff_rsp *rsp = work->response_buf;
+ struct smb2_logoff_rsp *rsp = smb2_get_msg(work->response_buf);
struct ksmbd_session *sess = work->sess;
rsp->StructureSize = cpu_to_le16(4);
- inc_rfc1001_len(rsp, 4);
+ inc_rfc1001_len(work->response_buf, 4);
ksmbd_debug(SMB, "request\n");
- /* Got a valid session, set connection state */
- WARN_ON(sess->conn != conn);
-
/* setting CifsExiting here may race with start_tcp_sess */
ksmbd_conn_set_need_reconnect(work);
ksmbd_close_session_fds(work);
ksmbd_conn_wait_idle(conn);
if (ksmbd_tree_conn_session_logoff(sess)) {
- struct smb2_logoff_req *req = work->request_buf;
+ struct smb2_logoff_req *req = smb2_get_msg(work->request_buf);
ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId);
rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
@@ -2075,8 +2092,8 @@ int smb2_session_logoff(struct ksmbd_work *work)
*/
static noinline int create_smb2_pipe(struct ksmbd_work *work)
{
- struct smb2_create_rsp *rsp = work->response_buf;
- struct smb2_create_req *req = work->request_buf;
+ struct smb2_create_rsp *rsp = smb2_get_msg(work->response_buf);
+ struct smb2_create_req *req = smb2_get_msg(work->request_buf);
int id;
int err;
char *name;
@@ -2099,7 +2116,7 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work)
rsp->hdr.Status = STATUS_SUCCESS;
rsp->StructureSize = cpu_to_le16(89);
rsp->OplockLevel = SMB2_OPLOCK_LEVEL_NONE;
- rsp->Reserved = 0;
+ rsp->Flags = 0;
rsp->CreateAction = cpu_to_le32(FILE_OPENED);
rsp->CreationTime = cpu_to_le64(0);
@@ -2107,14 +2124,14 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work)
rsp->ChangeTime = cpu_to_le64(0);
rsp->AllocationSize = cpu_to_le64(0);
rsp->EndofFile = cpu_to_le64(0);
- rsp->FileAttributes = ATTR_NORMAL_LE;
+ rsp->FileAttributes = FILE_ATTRIBUTE_NORMAL_LE;
rsp->Reserved2 = 0;
- rsp->VolatileFileId = cpu_to_le64(id);
+ rsp->VolatileFileId = id;
rsp->PersistentFileId = 0;
rsp->CreateContextsOffset = 0;
rsp->CreateContextsLength = 0;
- inc_rfc1001_len(rsp, 88); /* StructureSize - 1*/
+ inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/
kfree(name);
return 0;
@@ -2353,7 +2370,7 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon,
struct xattr_dos_attrib da;
int rc;
- fp->f_ci->m_fattr &= ~(ATTR_HIDDEN_LE | ATTR_SYSTEM_LE);
+ fp->f_ci->m_fattr &= ~(FILE_ATTRIBUTE_HIDDEN_LE | FILE_ATTRIBUTE_SYSTEM_LE);
/* get FileAttributes from XATTR_NAME_DOS_ATTRIBUTE */
if (!test_share_config_flag(tcon->share_conf,
@@ -2463,7 +2480,7 @@ int smb2_open(struct ksmbd_work *work)
struct ksmbd_session *sess = work->sess;
struct ksmbd_tree_connect *tcon = work->tcon;
struct smb2_create_req *req;
- struct smb2_create_rsp *rsp, *rsp_org;
+ struct smb2_create_rsp *rsp;
struct path path;
struct ksmbd_share_config *share = tcon->share_conf;
struct ksmbd_file *fp = NULL;
@@ -2489,7 +2506,6 @@ int smb2_open(struct ksmbd_work *work)
umode_t posix_mode = 0;
__le32 daccess, maximal_access = 0;
- rsp_org = work->response_buf;
WORK_BUFFERS(work, req, rsp);
if (req->hdr.NextCommand && !work->next_smb2_rcv_hdr_off &&
@@ -2513,8 +2529,7 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
}
- name = smb2_get_name(share,
- req->Buffer,
+ name = smb2_get_name(req->Buffer,
le16_to_cpu(req->NameLength),
work->conn->local_nls);
if (IS_ERR(name)) {
@@ -2559,7 +2574,7 @@ int smb2_open(struct ksmbd_work *work)
if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
lc = parse_lease_state(req);
- if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE_LE)) {
+ if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE)) {
pr_err("Invalid impersonationlevel : 0x%x\n",
le32_to_cpu(req->ImpersonationLevel));
rc = -EIO;
@@ -2567,7 +2582,7 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
}
- if (req->CreateOptions && !(req->CreateOptions & CREATE_OPTIONS_MASK)) {
+ if (req->CreateOptions && !(req->CreateOptions & CREATE_OPTIONS_MASK_LE)) {
pr_err("Invalid create options : 0x%x\n",
le32_to_cpu(req->CreateOptions));
rc = -EINVAL;
@@ -2609,7 +2624,7 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
}
- if (req->FileAttributes && !(req->FileAttributes & ATTR_MASK_LE)) {
+ if (req->FileAttributes && !(req->FileAttributes & FILE_ATTRIBUTE_MASK_LE)) {
pr_err("Invalid file attribute : 0x%x\n",
le32_to_cpu(req->FileAttributes));
rc = -EINVAL;
@@ -2670,7 +2685,7 @@ int smb2_open(struct ksmbd_work *work)
(struct create_posix *)context;
if (le16_to_cpu(context->DataOffset) +
le32_to_cpu(context->DataLength) <
- sizeof(struct create_posix)) {
+ sizeof(struct create_posix) - 4) {
rc = -EINVAL;
goto err_out1;
}
@@ -2740,7 +2755,7 @@ int smb2_open(struct ksmbd_work *work)
}
if (req->CreateOptions & FILE_DIRECTORY_FILE_LE &&
- req->FileAttributes & ATTR_NORMAL_LE) {
+ req->FileAttributes & FILE_ATTRIBUTE_NORMAL_LE) {
rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
rc = -EIO;
}
@@ -2904,7 +2919,6 @@ int smb2_open(struct ksmbd_work *work)
goto err_out;
}
- fp->filename = name;
fp->cdoption = req->CreateDisposition;
fp->daccess = daccess;
fp->saccess = req->ShareAccess;
@@ -2962,6 +2976,10 @@ int smb2_open(struct ksmbd_work *work)
&pntsd_size, &fattr);
posix_acl_release(fattr.cf_acls);
posix_acl_release(fattr.cf_dacls);
+ if (rc) {
+ kfree(pntsd);
+ goto err_out;
+ }
rc = ksmbd_vfs_set_sd_xattr(conn,
user_ns,
@@ -3119,7 +3137,7 @@ int smb2_open(struct ksmbd_work *work)
opinfo = rcu_dereference(fp->f_opinfo);
rsp->OplockLevel = opinfo != NULL ? opinfo->level : 0;
rcu_read_unlock();
- rsp->Reserved = 0;
+ rsp->Flags = 0;
rsp->CreateAction = cpu_to_le32(file_info);
rsp->CreationTime = cpu_to_le64(fp->create_time);
time = ksmbd_UnixTimeToNT(stat.atime);
@@ -3135,12 +3153,12 @@ int smb2_open(struct ksmbd_work *work)
rsp->Reserved2 = 0;
- rsp->PersistentFileId = cpu_to_le64(fp->persistent_id);
- rsp->VolatileFileId = cpu_to_le64(fp->volatile_id);
+ rsp->PersistentFileId = fp->persistent_id;
+ rsp->VolatileFileId = fp->volatile_id;
rsp->CreateContextsOffset = 0;
rsp->CreateContextsLength = 0;
- inc_rfc1001_len(rsp_org, 88); /* StructureSize - 1*/
+ inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/
/* If lease is request send lease context response */
if (opinfo && opinfo->is_lease) {
@@ -3155,7 +3173,8 @@ int smb2_open(struct ksmbd_work *work)
create_lease_buf(rsp->Buffer, opinfo->o_lease);
le32_add_cpu(&rsp->CreateContextsLength,
conn->vals->create_lease_size);
- inc_rfc1001_len(rsp_org, conn->vals->create_lease_size);
+ inc_rfc1001_len(work->response_buf,
+ conn->vals->create_lease_size);
next_ptr = &lease_ccontext->Next;
next_off = conn->vals->create_lease_size;
}
@@ -3175,7 +3194,8 @@ int smb2_open(struct ksmbd_work *work)
le32_to_cpu(maximal_access));
le32_add_cpu(&rsp->CreateContextsLength,
conn->vals->create_mxac_size);
- inc_rfc1001_len(rsp_org, conn->vals->create_mxac_size);
+ inc_rfc1001_len(work->response_buf,
+ conn->vals->create_mxac_size);
if (next_ptr)
*next_ptr = cpu_to_le32(next_off);
next_ptr = &mxac_ccontext->Next;
@@ -3193,7 +3213,8 @@ int smb2_open(struct ksmbd_work *work)
stat.ino, tcon->id);
le32_add_cpu(&rsp->CreateContextsLength,
conn->vals->create_disk_id_size);
- inc_rfc1001_len(rsp_org, conn->vals->create_disk_id_size);
+ inc_rfc1001_len(work->response_buf,
+ conn->vals->create_disk_id_size);
if (next_ptr)
*next_ptr = cpu_to_le32(next_off);
next_ptr = &disk_id_ccontext->Next;
@@ -3207,15 +3228,15 @@ int smb2_open(struct ksmbd_work *work)
fp);
le32_add_cpu(&rsp->CreateContextsLength,
conn->vals->create_posix_size);
- inc_rfc1001_len(rsp_org, conn->vals->create_posix_size);
+ inc_rfc1001_len(work->response_buf,
+ conn->vals->create_posix_size);
if (next_ptr)
*next_ptr = cpu_to_le32(next_off);
}
if (contxt_cnt > 0) {
rsp->CreateContextsOffset =
- cpu_to_le32(offsetof(struct smb2_create_rsp, Buffer)
- - 4);
+ cpu_to_le32(offsetof(struct smb2_create_rsp, Buffer));
}
err_out:
@@ -3249,14 +3270,13 @@ err_out1:
if (!rsp->hdr.Status)
rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
- if (!fp || !fp->filename)
- kfree(name);
if (fp)
ksmbd_fd_put(work, fp);
smb2_set_err_rsp(work);
ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status);
}
+ kfree(name);
kfree(lc);
return 0;
@@ -3368,7 +3388,6 @@ static int dentry_name(struct ksmbd_dir_info *d_info, int info_level)
* @conn: connection instance
* @info_level: smb information level
* @d_info: structure included variables for query dir
- * @user_ns: user namespace
* @ksmbd_kstat: ksmbd wrapper of dirent stat information
*
* if directory has many entries, find first can't read it fully.
@@ -3398,9 +3417,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
goto free_conv_name;
}
- struct_sz = readdir_info_level_struct_sz(info_level);
- next_entry_offset = ALIGN(struct_sz - 1 + conv_len,
- KSMBD_DIR_INFO_ALIGNMENT);
+ struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len;
+ next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT);
+ d_info->last_entry_off_align = next_entry_offset - struct_sz;
if (next_entry_offset > d_info->out_buf_len) {
d_info->out_buf_len = 0;
@@ -3422,9 +3441,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
ffdinfo->EaSize =
smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
if (ffdinfo->EaSize)
- ffdinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE;
+ ffdinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE;
if (d_info->hide_dot_file && d_info->name[0] == '.')
- ffdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ ffdinfo->ExtFileAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
memcpy(ffdinfo->FileName, conv_name, conv_len);
ffdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
break;
@@ -3438,11 +3457,11 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
fbdinfo->EaSize =
smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
if (fbdinfo->EaSize)
- fbdinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE;
+ fbdinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE;
fbdinfo->ShortNameLength = 0;
fbdinfo->Reserved = 0;
if (d_info->hide_dot_file && d_info->name[0] == '.')
- fbdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ fbdinfo->ExtFileAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
memcpy(fbdinfo->FileName, conv_name, conv_len);
fbdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
break;
@@ -3454,7 +3473,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
fdinfo = (struct file_directory_info *)kstat;
fdinfo->FileNameLength = cpu_to_le32(conv_len);
if (d_info->hide_dot_file && d_info->name[0] == '.')
- fdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ fdinfo->ExtFileAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
memcpy(fdinfo->FileName, conv_name, conv_len);
fdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
break;
@@ -3478,11 +3497,11 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
dinfo->EaSize =
smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
if (dinfo->EaSize)
- dinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE;
+ dinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE;
dinfo->Reserved = 0;
dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
if (d_info->hide_dot_file && d_info->name[0] == '.')
- dinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ dinfo->ExtFileAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
memcpy(dinfo->FileName, conv_name, conv_len);
dinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
break;
@@ -3496,13 +3515,13 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
fibdinfo->EaSize =
smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
if (fibdinfo->EaSize)
- fibdinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE;
+ fibdinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE;
fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
fibdinfo->ShortNameLength = 0;
fibdinfo->Reserved = 0;
fibdinfo->Reserved2 = cpu_to_le16(0);
if (d_info->hide_dot_file && d_info->name[0] == '.')
- fibdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ fibdinfo->ExtFileAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
memcpy(fibdinfo->FileName, conv_name, conv_len);
fibdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
break;
@@ -3528,9 +3547,10 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode);
posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino);
posix_info->DosAttributes =
- S_ISDIR(ksmbd_kstat->kstat->mode) ? ATTR_DIRECTORY_LE : ATTR_ARCHIVE_LE;
+ S_ISDIR(ksmbd_kstat->kstat->mode) ?
+ FILE_ATTRIBUTE_DIRECTORY_LE : FILE_ATTRIBUTE_ARCHIVE_LE;
if (d_info->hide_dot_file && d_info->name[0] == '.')
- posix_info->DosAttributes |= ATTR_HIDDEN_LE;
+ posix_info->DosAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid),
SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid),
@@ -3816,7 +3836,7 @@ int smb2_query_dir(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
struct smb2_query_directory_req *req;
- struct smb2_query_directory_rsp *rsp, *rsp_org;
+ struct smb2_query_directory_rsp *rsp;
struct ksmbd_share_config *share = work->tcon->share_conf;
struct ksmbd_file *dir_fp = NULL;
struct ksmbd_dir_info d_info;
@@ -3826,7 +3846,6 @@ int smb2_query_dir(struct ksmbd_work *work)
int buffer_sz;
struct smb2_query_dir_private query_dir_private = {NULL, };
- rsp_org = work->response_buf;
WORK_BUFFERS(work, req, rsp);
if (ksmbd_override_fsids(work)) {
@@ -3841,9 +3860,7 @@ int smb2_query_dir(struct ksmbd_work *work)
goto err_out2;
}
- dir_fp = ksmbd_lookup_fd_slow(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ dir_fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!dir_fp) {
rc = -EBADF;
goto err_out2;
@@ -3877,8 +3894,6 @@ int smb2_query_dir(struct ksmbd_work *work)
ksmbd_debug(SMB, "Search pattern is %s\n", srch_ptr);
}
- ksmbd_debug(SMB, "Directory name is %s\n", dir_fp->filename);
-
if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) {
ksmbd_debug(SMB, "Restart directory scan\n");
generic_file_llseek(dir_fp->filp, 0, SEEK_SET);
@@ -3947,16 +3962,17 @@ int smb2_query_dir(struct ksmbd_work *work)
rsp->OutputBufferOffset = cpu_to_le16(0);
rsp->OutputBufferLength = cpu_to_le32(0);
rsp->Buffer[0] = 0;
- inc_rfc1001_len(rsp_org, 9);
+ inc_rfc1001_len(work->response_buf, 9);
} else {
((struct file_directory_info *)
((char *)rsp->Buffer + d_info.last_entry_offset))
->NextEntryOffset = 0;
+ d_info.data_count -= d_info.last_entry_off_align;
rsp->StructureSize = cpu_to_le16(9);
rsp->OutputBufferOffset = cpu_to_le16(72);
rsp->OutputBufferLength = cpu_to_le32(d_info.data_count);
- inc_rfc1001_len(rsp_org, 8 + d_info.data_count);
+ inc_rfc1001_len(work->response_buf, 8 + d_info.data_count);
}
kfree(srch_ptr);
@@ -3994,31 +4010,34 @@ err_out2:
* buffer_check_err() - helper function to check buffer errors
* @reqOutputBufferLength: max buffer length expected in command response
* @rsp: query info response buffer contains output buffer length
+ * @rsp_org: base response buffer pointer in case of chained response
* @infoclass_size: query info class response buffer size
*
* Return: 0 on success, otherwise error
*/
static int buffer_check_err(int reqOutputBufferLength,
- struct smb2_query_info_rsp *rsp, int infoclass_size)
+ struct smb2_query_info_rsp *rsp,
+ void *rsp_org, int infoclass_size)
{
if (reqOutputBufferLength < le32_to_cpu(rsp->OutputBufferLength)) {
if (reqOutputBufferLength < infoclass_size) {
pr_err("Invalid Buffer Size Requested\n");
rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH;
- rsp->hdr.smb2_buf_length = cpu_to_be32(sizeof(struct smb2_hdr) - 4);
+ *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr));
return -EINVAL;
}
ksmbd_debug(SMB, "Buffer Overflow\n");
rsp->hdr.Status = STATUS_BUFFER_OVERFLOW;
- rsp->hdr.smb2_buf_length = cpu_to_be32(sizeof(struct smb2_hdr) - 4 +
+ *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr) +
reqOutputBufferLength);
rsp->OutputBufferLength = cpu_to_le32(reqOutputBufferLength);
}
return 0;
}
-static void get_standard_info_pipe(struct smb2_query_info_rsp *rsp)
+static void get_standard_info_pipe(struct smb2_query_info_rsp *rsp,
+ void *rsp_org)
{
struct smb2_file_standard_info *sinfo;
@@ -4031,10 +4050,11 @@ static void get_standard_info_pipe(struct smb2_query_info_rsp *rsp)
sinfo->Directory = 0;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_standard_info));
- inc_rfc1001_len(rsp, sizeof(struct smb2_file_standard_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_standard_info));
}
-static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num)
+static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num,
+ void *rsp_org)
{
struct smb2_file_internal_info *file_info;
@@ -4044,12 +4064,13 @@ static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num)
file_info->IndexNumber = cpu_to_le64(num | (1ULL << 63));
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_internal_info));
- inc_rfc1001_len(rsp, sizeof(struct smb2_file_internal_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info));
}
static int smb2_get_info_file_pipe(struct ksmbd_session *sess,
struct smb2_query_info_req *req,
- struct smb2_query_info_rsp *rsp)
+ struct smb2_query_info_rsp *rsp,
+ void *rsp_org)
{
u64 id;
int rc;
@@ -4058,23 +4079,25 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess,
* Windows can sometime send query file info request on
* pipe without opening it, checking error condition here
*/
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
if (!ksmbd_session_rpc_method(sess, id))
return -ENOENT;
ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n",
- req->FileInfoClass, le64_to_cpu(req->VolatileFileId));
+ req->FileInfoClass, req->VolatileFileId);
switch (req->FileInfoClass) {
case FILE_STANDARD_INFORMATION:
- get_standard_info_pipe(rsp);
+ get_standard_info_pipe(rsp, rsp_org);
rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
- rsp, FILE_STANDARD_INFORMATION_SIZE);
+ rsp, rsp_org,
+ FILE_STANDARD_INFORMATION_SIZE);
break;
case FILE_INTERNAL_INFORMATION:
- get_internal_info_pipe(rsp, id);
+ get_internal_info_pipe(rsp, id, rsp_org);
rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
- rsp, FILE_INTERNAL_INFORMATION_SIZE);
+ rsp, rsp_org,
+ FILE_INTERNAL_INFORMATION_SIZE);
break;
default:
ksmbd_debug(SMB, "smb2_info_file_pipe for %u not supported\n",
@@ -4364,9 +4387,9 @@ static int get_file_all_info(struct ksmbd_work *work,
return -EACCES;
}
- filename = convert_to_nt_pathname(fp->filename);
- if (!filename)
- return -ENOMEM;
+ filename = convert_to_nt_pathname(work->tcon->share_conf, &fp->filp->f_path);
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
inode = file_inode(fp->filp);
generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat);
@@ -4450,6 +4473,12 @@ static void get_file_stream_info(struct ksmbd_work *work,
&stat);
file_info = (struct smb2_file_stream_info *)rsp->Buffer;
+ buf_free_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (buf_free_len < 0)
+ goto out;
+
xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list);
if (xattr_list_len < 0) {
goto out;
@@ -4458,12 +4487,6 @@ static void get_file_stream_info(struct ksmbd_work *work,
goto out;
}
- buf_free_len =
- smb2_calc_max_out_buf_len(work, 8,
- le32_to_cpu(req->OutputBufferLength));
- if (buf_free_len < 0)
- goto out;
-
while (idx < xattr_list_len) {
stream_name = xattr_list + idx;
streamlen = strlen(stream_name);
@@ -4489,8 +4512,10 @@ static void get_file_stream_info(struct ksmbd_work *work,
":%s", &stream_name[XATTR_NAME_STREAM_LEN]);
next = sizeof(struct smb2_file_stream_info) + streamlen * 2;
- if (next > buf_free_len)
+ if (next > buf_free_len) {
+ kfree(stream_buf);
break;
+ }
file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes];
streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName,
@@ -4507,6 +4532,7 @@ static void get_file_stream_info(struct ksmbd_work *work,
file_info->NextEntryOffset = cpu_to_le32(next);
}
+out:
if (!S_ISDIR(stat.mode) &&
buf_free_len >= sizeof(struct smb2_file_stream_info) + 7 * 2) {
file_info = (struct smb2_file_stream_info *)
@@ -4515,14 +4541,13 @@ static void get_file_stream_info(struct ksmbd_work *work,
"::$DATA", 7, conn->local_nls, 0);
streamlen *= 2;
file_info->StreamNameLength = cpu_to_le32(streamlen);
- file_info->StreamSize = 0;
- file_info->StreamAllocationSize = 0;
+ file_info->StreamSize = cpu_to_le64(stat.size);
+ file_info->StreamAllocationSize = cpu_to_le64(stat.blocks << 9);
nbytes += sizeof(struct smb2_file_stream_info) + streamlen;
}
/* last entry offset should be 0 */
file_info->NextEntryOffset = 0;
-out:
kvfree(xattr_list);
rsp->OutputBufferLength = cpu_to_le32(nbytes);
@@ -4688,7 +4713,7 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
static int smb2_get_info_file(struct ksmbd_work *work,
struct smb2_query_info_req *req,
- struct smb2_query_info_rsp *rsp, void *rsp_org)
+ struct smb2_query_info_rsp *rsp)
{
struct ksmbd_file *fp;
int fileinfoclass = 0;
@@ -4699,11 +4724,12 @@ static int smb2_get_info_file(struct ksmbd_work *work,
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_PIPE)) {
/* smb2 info file called for pipe */
- return smb2_get_info_file_pipe(work->sess, req, rsp);
+ return smb2_get_info_file_pipe(work->sess, req, rsp,
+ work->response_buf);
}
if (work->next_smb2_rcv_hdr_off) {
- if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ if (!has_file_id(req->VolatileFileId)) {
ksmbd_debug(SMB, "Compound request set FID = %llu\n",
work->compound_fid);
id = work->compound_fid;
@@ -4712,8 +4738,8 @@ static int smb2_get_info_file(struct ksmbd_work *work,
}
if (!has_file_id(id)) {
- id = le64_to_cpu(req->VolatileFileId);
- pid = le64_to_cpu(req->PersistentFileId);
+ id = req->VolatileFileId;
+ pid = req->PersistentFileId;
}
fp = ksmbd_lookup_fd_slow(work, id, pid);
@@ -4724,77 +4750,77 @@ static int smb2_get_info_file(struct ksmbd_work *work,
switch (fileinfoclass) {
case FILE_ACCESS_INFORMATION:
- get_file_access_info(rsp, fp, rsp_org);
+ get_file_access_info(rsp, fp, work->response_buf);
file_infoclass_size = FILE_ACCESS_INFORMATION_SIZE;
break;
case FILE_BASIC_INFORMATION:
- rc = get_file_basic_info(rsp, fp, rsp_org);
+ rc = get_file_basic_info(rsp, fp, work->response_buf);
file_infoclass_size = FILE_BASIC_INFORMATION_SIZE;
break;
case FILE_STANDARD_INFORMATION:
- get_file_standard_info(rsp, fp, rsp_org);
+ get_file_standard_info(rsp, fp, work->response_buf);
file_infoclass_size = FILE_STANDARD_INFORMATION_SIZE;
break;
case FILE_ALIGNMENT_INFORMATION:
- get_file_alignment_info(rsp, rsp_org);
+ get_file_alignment_info(rsp, work->response_buf);
file_infoclass_size = FILE_ALIGNMENT_INFORMATION_SIZE;
break;
case FILE_ALL_INFORMATION:
- rc = get_file_all_info(work, rsp, fp, rsp_org);
+ rc = get_file_all_info(work, rsp, fp, work->response_buf);
file_infoclass_size = FILE_ALL_INFORMATION_SIZE;
break;
case FILE_ALTERNATE_NAME_INFORMATION:
- get_file_alternate_info(work, rsp, fp, rsp_org);
+ get_file_alternate_info(work, rsp, fp, work->response_buf);
file_infoclass_size = FILE_ALTERNATE_NAME_INFORMATION_SIZE;
break;
case FILE_STREAM_INFORMATION:
- get_file_stream_info(work, rsp, fp, rsp_org);
+ get_file_stream_info(work, rsp, fp, work->response_buf);
file_infoclass_size = FILE_STREAM_INFORMATION_SIZE;
break;
case FILE_INTERNAL_INFORMATION:
- get_file_internal_info(rsp, fp, rsp_org);
+ get_file_internal_info(rsp, fp, work->response_buf);
file_infoclass_size = FILE_INTERNAL_INFORMATION_SIZE;
break;
case FILE_NETWORK_OPEN_INFORMATION:
- rc = get_file_network_open_info(rsp, fp, rsp_org);
+ rc = get_file_network_open_info(rsp, fp, work->response_buf);
file_infoclass_size = FILE_NETWORK_OPEN_INFORMATION_SIZE;
break;
case FILE_EA_INFORMATION:
- get_file_ea_info(rsp, rsp_org);
+ get_file_ea_info(rsp, work->response_buf);
file_infoclass_size = FILE_EA_INFORMATION_SIZE;
break;
case FILE_FULL_EA_INFORMATION:
- rc = smb2_get_ea(work, fp, req, rsp, rsp_org);
+ rc = smb2_get_ea(work, fp, req, rsp, work->response_buf);
file_infoclass_size = FILE_FULL_EA_INFORMATION_SIZE;
break;
case FILE_POSITION_INFORMATION:
- get_file_position_info(rsp, fp, rsp_org);
+ get_file_position_info(rsp, fp, work->response_buf);
file_infoclass_size = FILE_POSITION_INFORMATION_SIZE;
break;
case FILE_MODE_INFORMATION:
- get_file_mode_info(rsp, fp, rsp_org);
+ get_file_mode_info(rsp, fp, work->response_buf);
file_infoclass_size = FILE_MODE_INFORMATION_SIZE;
break;
case FILE_COMPRESSION_INFORMATION:
- get_file_compression_info(rsp, fp, rsp_org);
+ get_file_compression_info(rsp, fp, work->response_buf);
file_infoclass_size = FILE_COMPRESSION_INFORMATION_SIZE;
break;
case FILE_ATTRIBUTE_TAG_INFORMATION:
- rc = get_file_attribute_tag_info(rsp, fp, rsp_org);
+ rc = get_file_attribute_tag_info(rsp, fp, work->response_buf);
file_infoclass_size = FILE_ATTRIBUTE_TAG_INFORMATION_SIZE;
break;
case SMB_FIND_FILE_POSIX_INFO:
@@ -4802,7 +4828,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
rc = -EOPNOTSUPP;
} else {
- rc = find_file_posix_info(rsp, fp, rsp_org);
+ rc = find_file_posix_info(rsp, fp, work->response_buf);
file_infoclass_size = sizeof(struct smb311_posix_qinfo);
}
break;
@@ -4813,7 +4839,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
}
if (!rc)
rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
- rsp,
+ rsp, work->response_buf,
file_infoclass_size);
ksmbd_fd_put(work, fp);
return rc;
@@ -4821,7 +4847,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
static int smb2_get_info_filesystem(struct ksmbd_work *work,
struct smb2_query_info_req *req,
- struct smb2_query_info_rsp *rsp, void *rsp_org)
+ struct smb2_query_info_rsp *rsp)
{
struct ksmbd_session *sess = work->sess;
struct ksmbd_conn *conn = sess->conn;
@@ -4857,7 +4883,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->DeviceType = cpu_to_le32(stfs.f_type);
info->DeviceCharacteristics = cpu_to_le32(0x00000020);
rsp->OutputBufferLength = cpu_to_le32(8);
- inc_rfc1001_len(rsp_org, 8);
+ inc_rfc1001_len(work->response_buf, 8);
fs_infoclass_size = FS_DEVICE_INFORMATION_SIZE;
break;
}
@@ -4883,7 +4909,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->FileSystemNameLen = cpu_to_le32(len);
sz = sizeof(struct filesystem_attribute_info) - 2 + len;
rsp->OutputBufferLength = cpu_to_le32(sz);
- inc_rfc1001_len(rsp_org, sz);
+ inc_rfc1001_len(work->response_buf, sz);
fs_infoclass_size = FS_ATTRIBUTE_INFORMATION_SIZE;
break;
}
@@ -4891,11 +4917,18 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
{
struct filesystem_vol_info *info;
size_t sz;
+ unsigned int serial_crc = 0;
info = (struct filesystem_vol_info *)(rsp->Buffer);
info->VolumeCreationTime = 0;
+ serial_crc = crc32_le(serial_crc, share->name,
+ strlen(share->name));
+ serial_crc = crc32_le(serial_crc, share->path,
+ strlen(share->path));
+ serial_crc = crc32_le(serial_crc, ksmbd_netbios_name(),
+ strlen(ksmbd_netbios_name()));
/* Taking dummy value of serial number*/
- info->SerialNumber = cpu_to_le32(0xbc3ac512);
+ info->SerialNumber = cpu_to_le32(serial_crc);
len = smbConvertToUTF16((__le16 *)info->VolumeLabel,
share->name, PATH_MAX,
conn->local_nls, 0);
@@ -4904,7 +4937,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->Reserved = 0;
sz = sizeof(struct filesystem_vol_info) - 2 + len;
rsp->OutputBufferLength = cpu_to_le32(sz);
- inc_rfc1001_len(rsp_org, sz);
+ inc_rfc1001_len(work->response_buf, sz);
fs_infoclass_size = FS_VOLUME_INFORMATION_SIZE;
break;
}
@@ -4918,7 +4951,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->SectorsPerAllocationUnit = cpu_to_le32(1);
info->BytesPerSector = cpu_to_le32(stfs.f_bsize);
rsp->OutputBufferLength = cpu_to_le32(24);
- inc_rfc1001_len(rsp_org, 24);
+ inc_rfc1001_len(work->response_buf, 24);
fs_infoclass_size = FS_SIZE_INFORMATION_SIZE;
break;
}
@@ -4935,7 +4968,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->SectorsPerAllocationUnit = cpu_to_le32(1);
info->BytesPerSector = cpu_to_le32(stfs.f_bsize);
rsp->OutputBufferLength = cpu_to_le32(32);
- inc_rfc1001_len(rsp_org, 32);
+ inc_rfc1001_len(work->response_buf, 32);
fs_infoclass_size = FS_FULL_SIZE_INFORMATION_SIZE;
break;
}
@@ -4956,28 +4989,30 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->extended_info.rel_date = 0;
memcpy(info->extended_info.version_string, "1.1.0", strlen("1.1.0"));
rsp->OutputBufferLength = cpu_to_le32(64);
- inc_rfc1001_len(rsp_org, 64);
+ inc_rfc1001_len(work->response_buf, 64);
fs_infoclass_size = FS_OBJECT_ID_INFORMATION_SIZE;
break;
}
case FS_SECTOR_SIZE_INFORMATION:
{
struct smb3_fs_ss_info *info;
+ unsigned int sector_size =
+ min_t(unsigned int, path.mnt->mnt_sb->s_blocksize, 4096);
info = (struct smb3_fs_ss_info *)(rsp->Buffer);
- info->LogicalBytesPerSector = cpu_to_le32(stfs.f_bsize);
+ info->LogicalBytesPerSector = cpu_to_le32(sector_size);
info->PhysicalBytesPerSectorForAtomicity =
- cpu_to_le32(stfs.f_bsize);
- info->PhysicalBytesPerSectorForPerf = cpu_to_le32(stfs.f_bsize);
+ cpu_to_le32(sector_size);
+ info->PhysicalBytesPerSectorForPerf = cpu_to_le32(sector_size);
info->FSEffPhysicalBytesPerSectorForAtomicity =
- cpu_to_le32(stfs.f_bsize);
+ cpu_to_le32(sector_size);
info->Flags = cpu_to_le32(SSINFO_FLAGS_ALIGNED_DEVICE |
SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE);
info->ByteOffsetForSectorAlignment = 0;
info->ByteOffsetForPartitionAlignment = 0;
rsp->OutputBufferLength = cpu_to_le32(28);
- inc_rfc1001_len(rsp_org, 28);
+ inc_rfc1001_len(work->response_buf, 28);
fs_infoclass_size = FS_SECTOR_SIZE_INFORMATION_SIZE;
break;
}
@@ -4999,7 +5034,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->DefaultQuotaLimit = cpu_to_le64(SMB2_NO_FID);
info->Padding = 0;
rsp->OutputBufferLength = cpu_to_le32(48);
- inc_rfc1001_len(rsp_org, 48);
+ inc_rfc1001_len(work->response_buf, 48);
fs_infoclass_size = FS_CONTROL_INFORMATION_SIZE;
break;
}
@@ -5020,7 +5055,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->TotalFileNodes = cpu_to_le64(stfs.f_files);
info->FreeFileNodes = cpu_to_le64(stfs.f_ffree);
rsp->OutputBufferLength = cpu_to_le32(56);
- inc_rfc1001_len(rsp_org, 56);
+ inc_rfc1001_len(work->response_buf, 56);
fs_infoclass_size = FS_POSIX_INFORMATION_SIZE;
}
break;
@@ -5030,7 +5065,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
return -EOPNOTSUPP;
}
rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
- rsp,
+ rsp, work->response_buf,
fs_infoclass_size);
path_put(&path);
return rc;
@@ -5038,7 +5073,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
static int smb2_get_info_sec(struct ksmbd_work *work,
struct smb2_query_info_req *req,
- struct smb2_query_info_rsp *rsp, void *rsp_org)
+ struct smb2_query_info_rsp *rsp)
{
struct ksmbd_file *fp;
struct user_namespace *user_ns;
@@ -5053,7 +5088,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
if (addition_info & ~(OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO |
PROTECTED_DACL_SECINFO |
UNPROTECTED_DACL_SECINFO)) {
- pr_err("Unsupported addition info: 0x%x)\n",
+ ksmbd_debug(SMB, "Unsupported addition info: 0x%x)\n",
addition_info);
pntsd->revision = cpu_to_le16(1);
@@ -5065,13 +5100,13 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
secdesclen = sizeof(struct smb_ntsd);
rsp->OutputBufferLength = cpu_to_le32(secdesclen);
- inc_rfc1001_len(rsp_org, secdesclen);
+ inc_rfc1001_len(work->response_buf, secdesclen);
return 0;
}
if (work->next_smb2_rcv_hdr_off) {
- if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ if (!has_file_id(req->VolatileFileId)) {
ksmbd_debug(SMB, "Compound request set FID = %llu\n",
work->compound_fid);
id = work->compound_fid;
@@ -5080,8 +5115,8 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
}
if (!has_file_id(id)) {
- id = le64_to_cpu(req->VolatileFileId);
- pid = le64_to_cpu(req->PersistentFileId);
+ id = req->VolatileFileId;
+ pid = req->PersistentFileId;
}
fp = ksmbd_lookup_fd_slow(work, id, pid);
@@ -5107,7 +5142,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
return rc;
rsp->OutputBufferLength = cpu_to_le32(secdesclen);
- inc_rfc1001_len(rsp_org, secdesclen);
+ inc_rfc1001_len(work->response_buf, secdesclen);
return 0;
}
@@ -5120,10 +5155,9 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
int smb2_query_info(struct ksmbd_work *work)
{
struct smb2_query_info_req *req;
- struct smb2_query_info_rsp *rsp, *rsp_org;
+ struct smb2_query_info_rsp *rsp;
int rc = 0;
- rsp_org = work->response_buf;
WORK_BUFFERS(work, req, rsp);
ksmbd_debug(SMB, "GOT query info request\n");
@@ -5131,15 +5165,15 @@ int smb2_query_info(struct ksmbd_work *work)
switch (req->InfoType) {
case SMB2_O_INFO_FILE:
ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n");
- rc = smb2_get_info_file(work, req, rsp, (void *)rsp_org);
+ rc = smb2_get_info_file(work, req, rsp);
break;
case SMB2_O_INFO_FILESYSTEM:
ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILESYSTEM\n");
- rc = smb2_get_info_filesystem(work, req, rsp, (void *)rsp_org);
+ rc = smb2_get_info_filesystem(work, req, rsp);
break;
case SMB2_O_INFO_SECURITY:
ksmbd_debug(SMB, "GOT SMB2_O_INFO_SECURITY\n");
- rc = smb2_get_info_sec(work, req, rsp, (void *)rsp_org);
+ rc = smb2_get_info_sec(work, req, rsp);
break;
default:
ksmbd_debug(SMB, "InfoType %d not supported yet\n",
@@ -5164,7 +5198,7 @@ int smb2_query_info(struct ksmbd_work *work)
}
rsp->StructureSize = cpu_to_le16(9);
rsp->OutputBufferOffset = cpu_to_le16(72);
- inc_rfc1001_len(rsp_org, 8);
+ inc_rfc1001_len(work->response_buf, 8);
return 0;
}
@@ -5177,10 +5211,10 @@ int smb2_query_info(struct ksmbd_work *work)
static noinline int smb2_close_pipe(struct ksmbd_work *work)
{
u64 id;
- struct smb2_close_req *req = work->request_buf;
- struct smb2_close_rsp *rsp = work->response_buf;
+ struct smb2_close_req *req = smb2_get_msg(work->request_buf);
+ struct smb2_close_rsp *rsp = smb2_get_msg(work->response_buf);
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
ksmbd_session_rpc_close(work->sess, id);
rsp->StructureSize = cpu_to_le16(60);
@@ -5193,7 +5227,7 @@ static noinline int smb2_close_pipe(struct ksmbd_work *work)
rsp->AllocationSize = 0;
rsp->EndOfFile = 0;
rsp->Attributes = 0;
- inc_rfc1001_len(rsp, 60);
+ inc_rfc1001_len(work->response_buf, 60);
return 0;
}
@@ -5209,14 +5243,12 @@ int smb2_close(struct ksmbd_work *work)
u64 sess_id;
struct smb2_close_req *req;
struct smb2_close_rsp *rsp;
- struct smb2_close_rsp *rsp_org;
struct ksmbd_conn *conn = work->conn;
struct ksmbd_file *fp;
struct inode *inode;
u64 time;
int err = 0;
- rsp_org = work->response_buf;
WORK_BUFFERS(work, req, rsp);
if (test_share_config_flag(work->tcon->share_conf,
@@ -5241,7 +5273,7 @@ int smb2_close(struct ksmbd_work *work)
}
if (work->next_smb2_rcv_hdr_off &&
- !has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ !has_file_id(req->VolatileFileId)) {
if (!has_file_id(work->compound_fid)) {
/* file already closed, return FILE_CLOSED */
ksmbd_debug(SMB, "file already closed\n");
@@ -5260,7 +5292,7 @@ int smb2_close(struct ksmbd_work *work)
work->compound_pfid = KSMBD_NO_FID;
}
} else {
- volatile_id = le64_to_cpu(req->VolatileFileId);
+ volatile_id = req->VolatileFileId;
}
ksmbd_debug(SMB, "volatile_id = %llu\n", volatile_id);
@@ -5306,7 +5338,7 @@ out:
rsp->hdr.Status = STATUS_FILE_CLOSED;
smb2_set_err_rsp(work);
} else {
- inc_rfc1001_len(rsp_org, 60);
+ inc_rfc1001_len(work->response_buf, 60);
}
return 0;
@@ -5320,11 +5352,11 @@ out:
*/
int smb2_echo(struct ksmbd_work *work)
{
- struct smb2_echo_rsp *rsp = work->response_buf;
+ struct smb2_echo_rsp *rsp = smb2_get_msg(work->response_buf);
rsp->StructureSize = cpu_to_le16(4);
rsp->Reserved = 0;
- inc_rfc1001_len(rsp, 4);
+ inc_rfc1001_len(work->response_buf, 4);
return 0;
}
@@ -5361,8 +5393,7 @@ static int smb2_rename(struct ksmbd_work *work,
goto out;
}
- new_name = smb2_get_name(share,
- file_info->FileName,
+ new_name = smb2_get_name(file_info->FileName,
le32_to_cpu(file_info->FileNameLength),
local_nls);
if (IS_ERR(new_name)) {
@@ -5473,8 +5504,7 @@ static int smb2_create_link(struct ksmbd_work *work,
if (!pathname)
return -ENOMEM;
- link_name = smb2_get_name(share,
- file_info->FileName,
+ link_name = smb2_get_name(file_info->FileName,
le32_to_cpu(file_info->FileNameLength),
local_nls);
if (IS_ERR(link_name) || S_ISDIR(file_inode(filp)->i_mode)) {
@@ -5566,14 +5596,14 @@ static int set_file_basic_info(struct ksmbd_file *fp,
if (file_info->Attributes) {
if (!S_ISDIR(inode->i_mode) &&
- file_info->Attributes & ATTR_DIRECTORY_LE) {
+ file_info->Attributes & FILE_ATTRIBUTE_DIRECTORY_LE) {
pr_err("can't change a file to a directory\n");
return -EINVAL;
}
- if (!(S_ISDIR(inode->i_mode) && file_info->Attributes == ATTR_NORMAL_LE))
+ if (!(S_ISDIR(inode->i_mode) && file_info->Attributes == FILE_ATTRIBUTE_NORMAL_LE))
fp->f_ci->m_fattr = file_info->Attributes |
- (fp->f_ci->m_fattr & ATTR_DIRECTORY_LE);
+ (fp->f_ci->m_fattr & FILE_ATTRIBUTE_DIRECTORY_LE);
}
if (test_share_config_flag(share, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS) &&
@@ -5652,8 +5682,7 @@ static int set_file_allocation_info(struct ksmbd_work *work,
size = i_size_read(inode);
rc = ksmbd_vfs_truncate(work, fp, alloc_blks * 512);
if (rc) {
- pr_err("truncate failed! filename : %s, err %d\n",
- fp->filename, rc);
+ pr_err("truncate failed!, err %d\n", rc);
return rc;
}
if (size < alloc_blks * 512)
@@ -5683,12 +5712,10 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
* truncated range.
*/
if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) {
- ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n",
- fp->filename, newsize);
+ ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize);
rc = ksmbd_vfs_truncate(work, fp, newsize);
if (rc) {
- ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n",
- fp->filename, rc);
+ ksmbd_debug(SMB, "truncate failed!, err %d\n", rc);
if (rc != -EAGAIN)
rc = -EBADF;
return rc;
@@ -5734,8 +5761,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
if (parent_fp) {
if (parent_fp->daccess & FILE_DELETE_LE) {
pr_err("parent dir is opened with delete access\n");
+ ksmbd_fd_put(work, parent_fp);
return -ESHARE;
}
+ ksmbd_fd_put(work, parent_fp);
}
next:
return smb2_rename(work, fp, user_ns, rename_info,
@@ -5794,9 +5823,7 @@ static int set_file_mode_info(struct ksmbd_file *fp,
mode = file_info->Mode;
- if ((mode & ~FILE_MODE_INFO_MASK) ||
- (mode & FILE_SYNCHRONOUS_IO_ALERT_LE &&
- mode & FILE_SYNCHRONOUS_IO_NONALERT_LE)) {
+ if ((mode & ~FILE_MODE_INFO_MASK)) {
pr_err("Mode is not valid : 0x%x\n", le32_to_cpu(mode));
return -EINVAL;
}
@@ -5814,7 +5841,7 @@ static int set_file_mode_info(struct ksmbd_file *fp,
* smb2_set_info_file() - handler for smb2 set info command
* @work: smb work containing set info command buffer
* @fp: ksmbd_file pointer
- * @info_class: smb2 set info class
+ * @req: request buffer pointer
* @share: ksmbd_share_config pointer
*
* Return: 0 on success, otherwise error
@@ -5943,31 +5970,30 @@ static int smb2_set_info_sec(struct ksmbd_file *fp, int addition_info,
int smb2_set_info(struct ksmbd_work *work)
{
struct smb2_set_info_req *req;
- struct smb2_set_info_rsp *rsp, *rsp_org;
+ struct smb2_set_info_rsp *rsp;
struct ksmbd_file *fp;
int rc = 0;
unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
ksmbd_debug(SMB, "Received set info request\n");
- rsp_org = work->response_buf;
if (work->next_smb2_rcv_hdr_off) {
req = ksmbd_req_buf_next(work);
rsp = ksmbd_resp_buf_next(work);
- if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ if (!has_file_id(req->VolatileFileId)) {
ksmbd_debug(SMB, "Compound request set FID = %llu\n",
work->compound_fid);
id = work->compound_fid;
pid = work->compound_pfid;
}
} else {
- req = work->request_buf;
- rsp = work->response_buf;
+ req = smb2_get_msg(work->request_buf);
+ rsp = smb2_get_msg(work->response_buf);
}
if (!has_file_id(id)) {
- id = le64_to_cpu(req->VolatileFileId);
- pid = le64_to_cpu(req->PersistentFileId);
+ id = req->VolatileFileId;
+ pid = req->PersistentFileId;
}
fp = ksmbd_lookup_fd_slow(work, id, pid);
@@ -6002,7 +6028,7 @@ int smb2_set_info(struct ksmbd_work *work)
goto err_out;
rsp->StructureSize = cpu_to_le16(2);
- inc_rfc1001_len(rsp_org, 2);
+ inc_rfc1001_len(work->response_buf, 2);
ksmbd_fd_put(work, fp);
return 0;
@@ -6042,12 +6068,12 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
int nbytes = 0, err;
u64 id;
struct ksmbd_rpc_command *rpc_resp;
- struct smb2_read_req *req = work->request_buf;
- struct smb2_read_rsp *rsp = work->response_buf;
+ struct smb2_read_req *req = smb2_get_msg(work->request_buf);
+ struct smb2_read_rsp *rsp = smb2_get_msg(work->response_buf);
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
- inc_rfc1001_len(rsp, 16);
+ inc_rfc1001_len(work->response_buf, 16);
rpc_resp = ksmbd_rpc_read(work->sess, id);
if (rpc_resp) {
if (rpc_resp->flags != KSMBD_RPC_OK) {
@@ -6066,7 +6092,7 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
rpc_resp->payload_sz);
nbytes = rpc_resp->payload_sz;
- work->resp_hdr_sz = get_rfc1002_len(rsp) + 4;
+ work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4;
work->aux_payload_sz = nbytes;
kvfree(rpc_resp);
}
@@ -6076,8 +6102,8 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
rsp->Reserved = 0;
rsp->DataLength = cpu_to_le32(nbytes);
rsp->DataRemaining = 0;
- rsp->Reserved2 = 0;
- inc_rfc1001_len(rsp, nbytes);
+ rsp->Flags = 0;
+ inc_rfc1001_len(work->response_buf, nbytes);
return 0;
out:
@@ -6087,25 +6113,46 @@ out:
return err;
}
-static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work,
- struct smb2_read_req *req, void *data_buf,
- size_t length)
+static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
+ struct smb2_buffer_desc_v1 *desc,
+ __le32 Channel,
+ __le16 ChannelInfoOffset,
+ __le16 ChannelInfoLength)
{
- struct smb2_buffer_desc_v1 *desc =
- (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
- int err;
+ unsigned int i, ch_count;
if (work->conn->dialect == SMB30_PROT_ID &&
- req->Channel != SMB2_CHANNEL_RDMA_V1)
+ Channel != SMB2_CHANNEL_RDMA_V1)
return -EINVAL;
- if (req->ReadChannelInfoOffset == 0 ||
- le16_to_cpu(req->ReadChannelInfoLength) < sizeof(*desc))
+ ch_count = le16_to_cpu(ChannelInfoLength) / sizeof(*desc);
+ if (ksmbd_debug_types & KSMBD_DEBUG_RDMA) {
+ for (i = 0; i < ch_count; i++) {
+ pr_info("RDMA r/w request %#x: token %#x, length %#x\n",
+ i,
+ le32_to_cpu(desc[i].token),
+ le32_to_cpu(desc[i].length));
+ }
+ }
+ if (ch_count != 1) {
+ ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n",
+ ch_count);
return -EINVAL;
+ }
work->need_invalidate_rkey =
- (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
+ (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
work->remote_key = le32_to_cpu(desc->token);
+ return 0;
+}
+
+static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work,
+ struct smb2_read_req *req, void *data_buf,
+ size_t length)
+{
+ struct smb2_buffer_desc_v1 *desc =
+ (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
+ int err;
err = ksmbd_conn_rdma_write(work->conn, data_buf, length,
le32_to_cpu(desc->token),
@@ -6127,14 +6174,13 @@ int smb2_read(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
struct smb2_read_req *req;
- struct smb2_read_rsp *rsp, *rsp_org;
- struct ksmbd_file *fp;
+ struct smb2_read_rsp *rsp;
+ struct ksmbd_file *fp = NULL;
loff_t offset;
size_t length, mincount;
ssize_t nbytes = 0, remain_bytes = 0;
int err = 0;
- rsp_org = work->response_buf;
WORK_BUFFERS(work, req, rsp);
if (test_share_config_flag(work->tcon->share_conf,
@@ -6143,8 +6189,25 @@ int smb2_read(struct ksmbd_work *work)
return smb2_read_pipe(work);
}
- fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
+ req->Channel == SMB2_CHANNEL_RDMA_V1) {
+ unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset);
+
+ if (ch_offset < offsetof(struct smb2_read_req, Buffer)) {
+ err = -EINVAL;
+ goto out;
+ }
+ err = smb2_set_remote_key_for_rdma(work,
+ (struct smb2_buffer_desc_v1 *)
+ ((char *)req + ch_offset),
+ req->Channel,
+ req->ReadChannelInfoOffset,
+ req->ReadChannelInfoLength);
+ if (err)
+ goto out;
+ }
+
+ fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!fp) {
err = -ENOENT;
goto out;
@@ -6215,11 +6278,11 @@ int smb2_read(struct ksmbd_work *work)
rsp->Reserved = 0;
rsp->DataLength = cpu_to_le32(nbytes);
rsp->DataRemaining = cpu_to_le32(remain_bytes);
- rsp->Reserved2 = 0;
- inc_rfc1001_len(rsp_org, 16);
- work->resp_hdr_sz = get_rfc1002_len(rsp_org) + 4;
+ rsp->Flags = 0;
+ inc_rfc1001_len(work->response_buf, 16);
+ work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4;
work->aux_payload_sz = nbytes;
- inc_rfc1001_len(rsp_org, nbytes);
+ inc_rfc1001_len(work->response_buf, nbytes);
ksmbd_fd_put(work, fp);
return 0;
@@ -6254,8 +6317,8 @@ out:
*/
static noinline int smb2_write_pipe(struct ksmbd_work *work)
{
- struct smb2_write_req *req = work->request_buf;
- struct smb2_write_rsp *rsp = work->response_buf;
+ struct smb2_write_req *req = smb2_get_msg(work->request_buf);
+ struct smb2_write_rsp *rsp = smb2_get_msg(work->response_buf);
struct ksmbd_rpc_command *rpc_resp;
u64 id = 0;
int err = 0, ret = 0;
@@ -6263,16 +6326,17 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
size_t length;
length = le32_to_cpu(req->Length);
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
if (le16_to_cpu(req->DataOffset) ==
- (offsetof(struct smb2_write_req, Buffer) - 4)) {
+ offsetof(struct smb2_write_req, Buffer)) {
data_buf = (char *)&req->Buffer[0];
} else {
- if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length >
+ get_rfc1002_len(work->request_buf)) {
pr_err("invalid write data offset %u, smb_len %u\n",
le16_to_cpu(req->DataOffset),
- get_rfc1002_len(req));
+ get_rfc1002_len(work->request_buf));
err = -EINVAL;
goto out;
}
@@ -6304,7 +6368,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
rsp->DataLength = cpu_to_le32(length);
rsp->DataRemaining = 0;
rsp->Reserved2 = 0;
- inc_rfc1001_len(rsp, 16);
+ inc_rfc1001_len(work->response_buf, 16);
return 0;
out:
if (err) {
@@ -6327,21 +6391,6 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work,
desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
- if (work->conn->dialect == SMB30_PROT_ID &&
- req->Channel != SMB2_CHANNEL_RDMA_V1)
- return -EINVAL;
-
- if (req->Length != 0 || req->DataOffset != 0)
- return -EINVAL;
-
- if (req->WriteChannelInfoOffset == 0 ||
- le16_to_cpu(req->WriteChannelInfoLength) < sizeof(*desc))
- return -EINVAL;
-
- work->need_invalidate_rkey =
- (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
- work->remote_key = le32_to_cpu(desc->token);
-
data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO);
if (!data_buf)
return -ENOMEM;
@@ -6372,7 +6421,7 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work,
int smb2_write(struct ksmbd_work *work)
{
struct smb2_write_req *req;
- struct smb2_write_rsp *rsp, *rsp_org;
+ struct smb2_write_rsp *rsp;
struct ksmbd_file *fp = NULL;
loff_t offset;
size_t length;
@@ -6381,7 +6430,6 @@ int smb2_write(struct ksmbd_work *work)
bool writethrough = false;
int err = 0;
- rsp_org = work->response_buf;
WORK_BUFFERS(work, req, rsp);
if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) {
@@ -6389,14 +6437,32 @@ int smb2_write(struct ksmbd_work *work)
return smb2_write_pipe(work);
}
+ if (req->Channel == SMB2_CHANNEL_RDMA_V1 ||
+ req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
+ unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset);
+
+ if (req->Length != 0 || req->DataOffset != 0 ||
+ ch_offset < offsetof(struct smb2_write_req, Buffer)) {
+ err = -EINVAL;
+ goto out;
+ }
+ err = smb2_set_remote_key_for_rdma(work,
+ (struct smb2_buffer_desc_v1 *)
+ ((char *)req + ch_offset),
+ req->Channel,
+ req->WriteChannelInfoOffset,
+ req->WriteChannelInfoLength);
+ if (err)
+ goto out;
+ }
+
if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
ksmbd_debug(SMB, "User does not have write permission\n");
err = -EACCES;
goto out;
}
- fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!fp) {
err = -ENOENT;
goto out;
@@ -6424,13 +6490,14 @@ int smb2_write(struct ksmbd_work *work)
if (req->Channel != SMB2_CHANNEL_RDMA_V1 &&
req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
if (le16_to_cpu(req->DataOffset) ==
- (offsetof(struct smb2_write_req, Buffer) - 4)) {
+ offsetof(struct smb2_write_req, Buffer)) {
data_buf = (char *)&req->Buffer[0];
} else {
- if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length >
+ get_rfc1002_len(work->request_buf)) {
pr_err("invalid write data offset %u, smb_len %u\n",
le16_to_cpu(req->DataOffset),
- get_rfc1002_len(req));
+ get_rfc1002_len(work->request_buf));
err = -EINVAL;
goto out;
}
@@ -6468,7 +6535,7 @@ int smb2_write(struct ksmbd_work *work)
rsp->DataLength = cpu_to_le32(nbytes);
rsp->DataRemaining = 0;
rsp->Reserved2 = 0;
- inc_rfc1001_len(rsp_org, 16);
+ inc_rfc1001_len(work->response_buf, 16);
ksmbd_fd_put(work, fp);
return 0;
@@ -6502,24 +6569,20 @@ out:
int smb2_flush(struct ksmbd_work *work)
{
struct smb2_flush_req *req;
- struct smb2_flush_rsp *rsp, *rsp_org;
+ struct smb2_flush_rsp *rsp;
int err;
- rsp_org = work->response_buf;
WORK_BUFFERS(work, req, rsp);
- ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n",
- le64_to_cpu(req->VolatileFileId));
+ ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n", req->VolatileFileId);
- err = ksmbd_vfs_fsync(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ err = ksmbd_vfs_fsync(work, req->VolatileFileId, req->PersistentFileId);
if (err)
goto out;
rsp->StructureSize = cpu_to_le16(4);
rsp->Reserved = 0;
- inc_rfc1001_len(rsp_org, 4);
+ inc_rfc1001_len(work->response_buf, 4);
return 0;
out:
@@ -6540,10 +6603,9 @@ out:
int smb2_cancel(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb2_hdr *hdr = work->request_buf;
+ struct smb2_hdr *hdr = smb2_get_msg(work->request_buf);
struct smb2_hdr *chdr;
- struct ksmbd_work *cancel_work = NULL;
- int canceled = 0;
+ struct ksmbd_work *cancel_work = NULL, *iter;
struct list_head *command_list;
ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n",
@@ -6553,11 +6615,11 @@ int smb2_cancel(struct ksmbd_work *work)
command_list = &conn->async_requests;
spin_lock(&conn->request_lock);
- list_for_each_entry(cancel_work, command_list,
+ list_for_each_entry(iter, command_list,
async_request_entry) {
- chdr = cancel_work->request_buf;
+ chdr = smb2_get_msg(iter->request_buf);
- if (cancel_work->async_id !=
+ if (iter->async_id !=
le64_to_cpu(hdr->Id.AsyncId))
continue;
@@ -6565,7 +6627,7 @@ int smb2_cancel(struct ksmbd_work *work)
"smb2 with AsyncId %llu cancelled command = 0x%x\n",
le64_to_cpu(hdr->Id.AsyncId),
le16_to_cpu(chdr->Command));
- canceled = 1;
+ cancel_work = iter;
break;
}
spin_unlock(&conn->request_lock);
@@ -6573,24 +6635,24 @@ int smb2_cancel(struct ksmbd_work *work)
command_list = &conn->requests;
spin_lock(&conn->request_lock);
- list_for_each_entry(cancel_work, command_list, request_entry) {
- chdr = cancel_work->request_buf;
+ list_for_each_entry(iter, command_list, request_entry) {
+ chdr = smb2_get_msg(iter->request_buf);
if (chdr->MessageId != hdr->MessageId ||
- cancel_work == work)
+ iter == work)
continue;
ksmbd_debug(SMB,
"smb2 with mid %llu cancelled command = 0x%x\n",
le64_to_cpu(hdr->MessageId),
le16_to_cpu(chdr->Command));
- canceled = 1;
+ cancel_work = iter;
break;
}
spin_unlock(&conn->request_lock);
}
- if (canceled) {
+ if (cancel_work) {
cancel_work->state = KSMBD_WORK_CANCELLED;
if (cancel_work->cancel_fn)
cancel_work->cancel_fn(cancel_work->cancel_argv);
@@ -6709,8 +6771,8 @@ static inline bool lock_defer_pending(struct file_lock *fl)
*/
int smb2_lock(struct ksmbd_work *work)
{
- struct smb2_lock_req *req = work->request_buf;
- struct smb2_lock_rsp *rsp = work->response_buf;
+ struct smb2_lock_req *req = smb2_get_msg(work->request_buf);
+ struct smb2_lock_rsp *rsp = smb2_get_msg(work->response_buf);
struct smb2_lock_element *lock_ele;
struct ksmbd_file *fp = NULL;
struct file_lock *flock = NULL;
@@ -6728,12 +6790,9 @@ int smb2_lock(struct ksmbd_work *work)
int prior_lock = 0;
ksmbd_debug(SMB, "Received lock request\n");
- fp = ksmbd_lookup_fd_slow(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!fp) {
- ksmbd_debug(SMB, "Invalid file id for lock : %llu\n",
- le64_to_cpu(req->VolatileFileId));
+ ksmbd_debug(SMB, "Invalid file id for lock : %llu\n", req->VolatileFileId);
err = -ENOENT;
goto out2;
}
@@ -7017,7 +7076,7 @@ skip:
ksmbd_debug(SMB, "successful in taking lock\n");
rsp->hdr.Status = STATUS_SUCCESS;
rsp->Reserved = 0;
- inc_rfc1001_len(rsp, 4);
+ inc_rfc1001_len(work->response_buf, 4);
ksmbd_fd_put(work, fp);
return 0;
@@ -7088,8 +7147,8 @@ static int fsctl_copychunk(struct ksmbd_work *work,
ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0];
- rsp->VolatileFileId = cpu_to_le64(volatile_id);
- rsp->PersistentFileId = cpu_to_le64(persistent_id);
+ rsp->VolatileFileId = volatile_id;
+ rsp->PersistentFileId = persistent_id;
ci_rsp->ChunksWritten =
cpu_to_le32(ksmbd_server_side_copy_max_chunk_count());
ci_rsp->ChunkBytesWritten =
@@ -7210,15 +7269,10 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
struct sockaddr_storage_rsp *sockaddr_storage;
unsigned int flags;
unsigned long long speed;
- struct sockaddr_in6 *csin6 = (struct sockaddr_in6 *)&conn->peer_addr;
rtnl_lock();
for_each_netdev(&init_net, netdev) {
- if (out_buf_len <
- nbytes + sizeof(struct network_interface_info_ioctl_rsp)) {
- rtnl_unlock();
- return -ENOSPC;
- }
+ bool ipv4_set = false;
if (netdev->type == ARPHRD_LOOPBACK)
continue;
@@ -7226,12 +7280,20 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
flags = dev_get_flags(netdev);
if (!(flags & IFF_RUNNING))
continue;
+ipv6_retry:
+ if (out_buf_len <
+ nbytes + sizeof(struct network_interface_info_ioctl_rsp)) {
+ rtnl_unlock();
+ return -ENOSPC;
+ }
nii_rsp = (struct network_interface_info_ioctl_rsp *)
&rsp->Buffer[nbytes];
nii_rsp->IfIndex = cpu_to_le32(netdev->ifindex);
nii_rsp->Capability = 0;
+ if (netdev->real_num_tx_queues > 1)
+ nii_rsp->Capability |= cpu_to_le32(RSS_CAPABLE);
if (ksmbd_rdma_capable_netdev(netdev))
nii_rsp->Capability |= cpu_to_le32(RDMA_CAPABLE);
@@ -7256,8 +7318,7 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
nii_rsp->SockAddr_Storage;
memset(sockaddr_storage, 0, 128);
- if (conn->peer_addr.ss_family == PF_INET ||
- ipv6_addr_v4mapped(&csin6->sin6_addr)) {
+ if (!ipv4_set) {
struct in_device *idev;
sockaddr_storage->Family = cpu_to_le16(INTERNETWORK);
@@ -7268,6 +7329,9 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
continue;
sockaddr_storage->addr4.IPv4address =
idev_ipv4_address(idev);
+ nbytes += sizeof(struct network_interface_info_ioctl_rsp);
+ ipv4_set = true;
+ goto ipv6_retry;
} else {
struct inet6_dev *idev6;
struct inet6_ifaddr *ifa;
@@ -7289,9 +7353,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
break;
}
sockaddr_storage->addr6.ScopeId = 0;
+ nbytes += sizeof(struct network_interface_info_ioctl_rsp);
}
-
- nbytes += sizeof(struct network_interface_info_ioctl_rsp);
}
rtnl_unlock();
@@ -7299,8 +7362,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
if (nii_rsp)
nii_rsp->Next = 0;
- rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID);
- rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
+ rsp->PersistentFileId = SMB2_NO_FID;
+ rsp->VolatileFileId = SMB2_NO_FID;
return nbytes;
}
@@ -7312,7 +7375,7 @@ static int fsctl_validate_negotiate_info(struct ksmbd_conn *conn,
int ret = 0;
int dialect;
- if (in_buf_len < sizeof(struct validate_negotiate_info_req) +
+ if (in_buf_len < offsetof(struct validate_negotiate_info_req, Dialects) +
le16_to_cpu(neg_req->DialectCount) * sizeof(__le16))
return -EINVAL;
@@ -7435,9 +7498,9 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id,
old_fattr = fp->f_ci->m_fattr;
if (sparse->SetSparse)
- fp->f_ci->m_fattr |= ATTR_SPARSE_FILE_LE;
+ fp->f_ci->m_fattr |= FILE_ATTRIBUTE_SPARSE_FILE_LE;
else
- fp->f_ci->m_fattr &= ~ATTR_SPARSE_FILE_LE;
+ fp->f_ci->m_fattr &= ~FILE_ATTRIBUTE_SPARSE_FILE_LE;
if (fp->f_ci->m_fattr != old_fattr &&
test_share_config_flag(work->tcon->share_conf,
@@ -7467,9 +7530,7 @@ static int fsctl_request_resume_key(struct ksmbd_work *work,
{
struct ksmbd_file *fp;
- fp = ksmbd_lookup_fd_slow(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!fp)
return -ENOENT;
@@ -7490,35 +7551,34 @@ static int fsctl_request_resume_key(struct ksmbd_work *work,
int smb2_ioctl(struct ksmbd_work *work)
{
struct smb2_ioctl_req *req;
- struct smb2_ioctl_rsp *rsp, *rsp_org;
+ struct smb2_ioctl_rsp *rsp;
unsigned int cnt_code, nbytes = 0, out_buf_len, in_buf_len;
u64 id = KSMBD_NO_FID;
struct ksmbd_conn *conn = work->conn;
int ret = 0;
- rsp_org = work->response_buf;
if (work->next_smb2_rcv_hdr_off) {
req = ksmbd_req_buf_next(work);
rsp = ksmbd_resp_buf_next(work);
- if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ if (!has_file_id(req->VolatileFileId)) {
ksmbd_debug(SMB, "Compound request set FID = %llu\n",
work->compound_fid);
id = work->compound_fid;
}
} else {
- req = work->request_buf;
- rsp = work->response_buf;
+ req = smb2_get_msg(work->request_buf);
+ rsp = smb2_get_msg(work->response_buf);
}
if (!has_file_id(id))
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
if (req->Flags != cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL)) {
rsp->hdr.Status = STATUS_NOT_SUPPORTED;
goto out;
}
- cnt_code = le32_to_cpu(req->CntCode);
+ cnt_code = le32_to_cpu(req->CtlCode);
ret = smb2_calc_max_out_buf_len(work, 48,
le32_to_cpu(req->MaxOutputResponse));
if (ret < 0) {
@@ -7577,8 +7637,8 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
nbytes = sizeof(struct validate_negotiate_info_rsp);
- rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID);
- rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
+ rsp->PersistentFileId = SMB2_NO_FID;
+ rsp->VolatileFileId = SMB2_NO_FID;
break;
case FSCTL_QUERY_NETWORK_INTERFACE_INFO:
ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len);
@@ -7624,10 +7684,10 @@ int smb2_ioctl(struct ksmbd_work *work)
rsp->PersistentFileId = req->PersistentFileId;
fsctl_copychunk(work,
(struct copychunk_ioctl_req *)&req->Buffer[0],
- le32_to_cpu(req->CntCode),
+ le32_to_cpu(req->CtlCode),
le32_to_cpu(req->InputCount),
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId),
+ req->VolatileFileId,
+ req->PersistentFileId,
rsp);
break;
case FSCTL_SET_SPARSE:
@@ -7778,7 +7838,7 @@ dup_ext_out:
goto out;
}
- rsp->CntCode = cpu_to_le32(cnt_code);
+ rsp->CtlCode = cpu_to_le32(cnt_code);
rsp->InputCount = cpu_to_le32(0);
rsp->InputOffset = cpu_to_le32(112);
rsp->OutputOffset = cpu_to_le32(112);
@@ -7787,7 +7847,7 @@ dup_ext_out:
rsp->Reserved = cpu_to_le16(0);
rsp->Flags = cpu_to_le32(0);
rsp->Reserved2 = cpu_to_le32(0);
- inc_rfc1001_len(rsp_org, 48 + nbytes);
+ inc_rfc1001_len(work->response_buf, 48 + nbytes);
return 0;
@@ -7814,8 +7874,8 @@ out:
*/
static void smb20_oplock_break_ack(struct ksmbd_work *work)
{
- struct smb2_oplock_break *req = work->request_buf;
- struct smb2_oplock_break *rsp = work->response_buf;
+ struct smb2_oplock_break *req = smb2_get_msg(work->request_buf);
+ struct smb2_oplock_break *rsp = smb2_get_msg(work->response_buf);
struct ksmbd_file *fp;
struct oplock_info *opinfo = NULL;
__le32 err = 0;
@@ -7824,8 +7884,8 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
char req_oplevel = 0, rsp_oplevel = 0;
unsigned int oplock_change_type;
- volatile_id = le64_to_cpu(req->VolatileFid);
- persistent_id = le64_to_cpu(req->PersistentFid);
+ volatile_id = req->VolatileFid;
+ persistent_id = req->PersistentFid;
req_oplevel = req->OplockLevel;
ksmbd_debug(OPLOCK, "v_id %llu, p_id %llu request oplock level %d\n",
volatile_id, persistent_id, req_oplevel);
@@ -7920,9 +7980,9 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
rsp->OplockLevel = rsp_oplevel;
rsp->Reserved = 0;
rsp->Reserved2 = 0;
- rsp->VolatileFid = cpu_to_le64(volatile_id);
- rsp->PersistentFid = cpu_to_le64(persistent_id);
- inc_rfc1001_len(rsp, 24);
+ rsp->VolatileFid = volatile_id;
+ rsp->PersistentFid = persistent_id;
+ inc_rfc1001_len(work->response_buf, 24);
return;
err_out:
@@ -7958,8 +8018,8 @@ static int check_lease_state(struct lease *lease, __le32 req_state)
static void smb21_lease_break_ack(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb2_lease_ack *req = work->request_buf;
- struct smb2_lease_ack *rsp = work->response_buf;
+ struct smb2_lease_ack *req = smb2_get_msg(work->request_buf);
+ struct smb2_lease_ack *rsp = smb2_get_msg(work->response_buf);
struct oplock_info *opinfo;
__le32 err = 0;
int ret = 0;
@@ -8071,7 +8131,7 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
memcpy(rsp->LeaseKey, req->LeaseKey, 16);
rsp->LeaseState = lease_state;
rsp->LeaseDuration = 0;
- inc_rfc1001_len(rsp, 36);
+ inc_rfc1001_len(work->response_buf, 36);
return;
err_out:
@@ -8092,8 +8152,8 @@ err_out:
*/
int smb2_oplock_break(struct ksmbd_work *work)
{
- struct smb2_oplock_break *req = work->request_buf;
- struct smb2_oplock_break *rsp = work->response_buf;
+ struct smb2_oplock_break *req = smb2_get_msg(work->request_buf);
+ struct smb2_oplock_break *rsp = smb2_get_msg(work->response_buf);
switch (le16_to_cpu(req->StructureSize)) {
case OP_BREAK_STRUCT_SIZE_20:
@@ -8120,8 +8180,8 @@ int smb2_oplock_break(struct ksmbd_work *work)
*/
int smb2_notify(struct ksmbd_work *work)
{
- struct smb2_notify_req *req;
- struct smb2_notify_rsp *rsp;
+ struct smb2_change_notify_req *req;
+ struct smb2_change_notify_rsp *rsp;
WORK_BUFFERS(work, req, rsp);
@@ -8145,7 +8205,7 @@ int smb2_notify(struct ksmbd_work *work)
*/
bool smb2_is_sign_req(struct ksmbd_work *work, unsigned int command)
{
- struct smb2_hdr *rcv_hdr2 = work->request_buf;
+ struct smb2_hdr *rcv_hdr2 = smb2_get_msg(work->request_buf);
if ((rcv_hdr2->Flags & SMB2_FLAGS_SIGNED) &&
command != SMB2_NEGOTIATE_HE &&
@@ -8164,22 +8224,22 @@ bool smb2_is_sign_req(struct ksmbd_work *work, unsigned int command)
*/
int smb2_check_sign_req(struct ksmbd_work *work)
{
- struct smb2_hdr *hdr, *hdr_org;
+ struct smb2_hdr *hdr;
char signature_req[SMB2_SIGNATURE_SIZE];
char signature[SMB2_HMACSHA256_SIZE];
struct kvec iov[1];
size_t len;
- hdr_org = hdr = work->request_buf;
+ hdr = smb2_get_msg(work->request_buf);
if (work->next_smb2_rcv_hdr_off)
hdr = ksmbd_req_buf_next(work);
if (!hdr->NextCommand && !work->next_smb2_rcv_hdr_off)
- len = be32_to_cpu(hdr_org->smb2_buf_length);
+ len = get_rfc1002_len(work->request_buf);
else if (hdr->NextCommand)
len = le32_to_cpu(hdr->NextCommand);
else
- len = be32_to_cpu(hdr_org->smb2_buf_length) -
+ len = get_rfc1002_len(work->request_buf) -
work->next_smb2_rcv_hdr_off;
memcpy(signature_req, hdr->Signature, SMB2_SIGNATURE_SIZE);
@@ -8207,25 +8267,26 @@ int smb2_check_sign_req(struct ksmbd_work *work)
*/
void smb2_set_sign_rsp(struct ksmbd_work *work)
{
- struct smb2_hdr *hdr, *hdr_org;
+ struct smb2_hdr *hdr;
struct smb2_hdr *req_hdr;
char signature[SMB2_HMACSHA256_SIZE];
struct kvec iov[2];
size_t len;
int n_vec = 1;
- hdr_org = hdr = work->response_buf;
+ hdr = smb2_get_msg(work->response_buf);
if (work->next_smb2_rsp_hdr_off)
hdr = ksmbd_resp_buf_next(work);
req_hdr = ksmbd_req_buf_next(work);
if (!work->next_smb2_rsp_hdr_off) {
- len = get_rfc1002_len(hdr_org);
+ len = get_rfc1002_len(work->response_buf);
if (req_hdr->NextCommand)
len = ALIGN(len, 8);
} else {
- len = get_rfc1002_len(hdr_org) - work->next_smb2_rsp_hdr_off;
+ len = get_rfc1002_len(work->response_buf) -
+ work->next_smb2_rsp_hdr_off;
len = ALIGN(len, 8);
}
@@ -8261,23 +8322,23 @@ int smb3_check_sign_req(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
char *signing_key;
- struct smb2_hdr *hdr, *hdr_org;
+ struct smb2_hdr *hdr;
struct channel *chann;
char signature_req[SMB2_SIGNATURE_SIZE];
char signature[SMB2_CMACAES_SIZE];
struct kvec iov[1];
size_t len;
- hdr_org = hdr = work->request_buf;
+ hdr = smb2_get_msg(work->request_buf);
if (work->next_smb2_rcv_hdr_off)
hdr = ksmbd_req_buf_next(work);
if (!hdr->NextCommand && !work->next_smb2_rcv_hdr_off)
- len = be32_to_cpu(hdr_org->smb2_buf_length);
+ len = get_rfc1002_len(work->request_buf);
else if (hdr->NextCommand)
len = le32_to_cpu(hdr->NextCommand);
else
- len = be32_to_cpu(hdr_org->smb2_buf_length) -
+ len = get_rfc1002_len(work->request_buf) -
work->next_smb2_rcv_hdr_off;
if (le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
@@ -8318,8 +8379,7 @@ int smb3_check_sign_req(struct ksmbd_work *work)
void smb3_set_sign_rsp(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb2_hdr *req_hdr;
- struct smb2_hdr *hdr, *hdr_org;
+ struct smb2_hdr *req_hdr, *hdr;
struct channel *chann;
char signature[SMB2_CMACAES_SIZE];
struct kvec iov[2];
@@ -8327,18 +8387,19 @@ void smb3_set_sign_rsp(struct ksmbd_work *work)
size_t len;
char *signing_key;
- hdr_org = hdr = work->response_buf;
+ hdr = smb2_get_msg(work->response_buf);
if (work->next_smb2_rsp_hdr_off)
hdr = ksmbd_resp_buf_next(work);
req_hdr = ksmbd_req_buf_next(work);
if (!work->next_smb2_rsp_hdr_off) {
- len = get_rfc1002_len(hdr_org);
+ len = get_rfc1002_len(work->response_buf);
if (req_hdr->NextCommand)
len = ALIGN(len, 8);
} else {
- len = get_rfc1002_len(hdr_org) - work->next_smb2_rsp_hdr_off;
+ len = get_rfc1002_len(work->response_buf) -
+ work->next_smb2_rsp_hdr_off;
len = ALIGN(len, 8);
}
@@ -8391,7 +8452,7 @@ void smb3_preauth_hash_rsp(struct ksmbd_work *work)
if (le16_to_cpu(req->Command) == SMB2_NEGOTIATE_HE &&
conn->preauth_info)
- ksmbd_gen_preauth_integrity_hash(conn, (char *)rsp,
+ ksmbd_gen_preauth_integrity_hash(conn, work->response_buf,
conn->preauth_info->Preauth_HashValue);
if (le16_to_cpu(rsp->Command) == SMB2_SESSION_SETUP_HE && sess) {
@@ -8409,35 +8470,34 @@ void smb3_preauth_hash_rsp(struct ksmbd_work *work)
if (!hash_value)
return;
}
- ksmbd_gen_preauth_integrity_hash(conn, (char *)rsp,
+ ksmbd_gen_preauth_integrity_hash(conn, work->response_buf,
hash_value);
}
}
-static void fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, char *old_buf,
- __le16 cipher_type)
+static void fill_transform_hdr(void *tr_buf, char *old_buf, __le16 cipher_type)
{
- struct smb2_hdr *hdr = (struct smb2_hdr *)old_buf;
+ struct smb2_transform_hdr *tr_hdr = tr_buf + 4;
+ struct smb2_hdr *hdr = smb2_get_msg(old_buf);
unsigned int orig_len = get_rfc1002_len(old_buf);
- memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
+ /* tr_buf must be cleared by the caller */
tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len);
- tr_hdr->Flags = cpu_to_le16(0x01);
+ tr_hdr->Flags = cpu_to_le16(TRANSFORM_FLAG_ENCRYPTED);
if (cipher_type == SMB2_ENCRYPTION_AES128_GCM ||
cipher_type == SMB2_ENCRYPTION_AES256_GCM)
get_random_bytes(&tr_hdr->Nonce, SMB3_AES_GCM_NONCE);
else
get_random_bytes(&tr_hdr->Nonce, SMB3_AES_CCM_NONCE);
memcpy(&tr_hdr->SessionId, &hdr->SessionId, 8);
- inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - 4);
- inc_rfc1001_len(tr_hdr, orig_len);
+ inc_rfc1001_len(tr_buf, sizeof(struct smb2_transform_hdr));
+ inc_rfc1001_len(tr_buf, orig_len);
}
int smb3_encrypt_resp(struct ksmbd_work *work)
{
char *buf = work->response_buf;
- struct smb2_transform_hdr *tr_hdr;
struct kvec iov[3];
int rc = -ENOMEM;
int buf_size = 0, rq_nvec = 2 + (work->aux_payload_sz ? 1 : 0);
@@ -8445,15 +8505,15 @@ int smb3_encrypt_resp(struct ksmbd_work *work)
if (ARRAY_SIZE(iov) < rq_nvec)
return -ENOMEM;
- tr_hdr = kzalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL);
- if (!tr_hdr)
+ work->tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL);
+ if (!work->tr_buf)
return rc;
/* fill transform header */
- fill_transform_hdr(tr_hdr, buf, work->conn->cipher_type);
+ fill_transform_hdr(work->tr_buf, buf, work->conn->cipher_type);
- iov[0].iov_base = tr_hdr;
- iov[0].iov_len = sizeof(struct smb2_transform_hdr);
+ iov[0].iov_base = work->tr_buf;
+ iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4;
buf_size += iov[0].iov_len - 4;
iov[1].iov_base = buf + 4;
@@ -8473,15 +8533,14 @@ int smb3_encrypt_resp(struct ksmbd_work *work)
return rc;
memmove(buf, iov[1].iov_base, iov[1].iov_len);
- tr_hdr->smb2_buf_length = cpu_to_be32(buf_size);
- work->tr_buf = tr_hdr;
+ *(__be32 *)work->tr_buf = cpu_to_be32(buf_size);
return rc;
}
bool smb3_is_transform_hdr(void *buf)
{
- struct smb2_transform_hdr *trhdr = buf;
+ struct smb2_transform_hdr *trhdr = smb2_get_msg(buf);
return trhdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM;
}
@@ -8491,12 +8550,10 @@ int smb3_decrypt_req(struct ksmbd_work *work)
struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess;
char *buf = work->request_buf;
- struct smb2_hdr *hdr;
unsigned int pdu_length = get_rfc1002_len(buf);
struct kvec iov[2];
- int buf_data_size = pdu_length + 4 -
- sizeof(struct smb2_transform_hdr);
- struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
+ int buf_data_size = pdu_length - sizeof(struct smb2_transform_hdr);
+ struct smb2_transform_hdr *tr_hdr = smb2_get_msg(buf);
int rc = 0;
if (buf_data_size < sizeof(struct smb2_hdr)) {
@@ -8518,16 +8575,15 @@ int smb3_decrypt_req(struct ksmbd_work *work)
}
iov[0].iov_base = buf;
- iov[0].iov_len = sizeof(struct smb2_transform_hdr);
- iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr);
+ iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4;
+ iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr) + 4;
iov[1].iov_len = buf_data_size;
rc = ksmbd_crypt_message(conn, iov, 2, 0);
if (rc)
return rc;
memmove(buf + 4, iov[1].iov_base, buf_data_size);
- hdr = (struct smb2_hdr *)buf;
- hdr->smb2_buf_length = cpu_to_be32(buf_data_size);
+ *(__be32 *)buf = cpu_to_be32(buf_data_size);
return rc;
}
@@ -8535,7 +8591,7 @@ int smb3_decrypt_req(struct ksmbd_work *work)
bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb2_hdr *rsp = work->response_buf;
+ struct smb2_hdr *rsp = smb2_get_msg(work->response_buf);
if (conn->dialect < SMB30_PROT_ID)
return false;
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index ff5a2f01d34a..af455278d005 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -10,185 +10,24 @@
#include "ntlmssp.h"
#include "smbacl.h"
-/*
- * Note that, due to trying to use names similar to the protocol specifications,
- * there are many mixed case field names in the structures below. Although
- * this does not match typical Linux kernel style, it is necessary to be
- * able to match against the protocol specfication.
- *
- * SMB2 commands
- * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
- * (ie no useful data other than the SMB error code itself) and are marked such.
- * Knowing this helps avoid response buffer allocations and copy in some cases.
- */
-
-/* List of commands in host endian */
-#define SMB2_NEGOTIATE_HE 0x0000
-#define SMB2_SESSION_SETUP_HE 0x0001
-#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */
-#define SMB2_TREE_CONNECT_HE 0x0003
-#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */
-#define SMB2_CREATE_HE 0x0005
-#define SMB2_CLOSE_HE 0x0006
-#define SMB2_FLUSH_HE 0x0007 /* trivial resp */
-#define SMB2_READ_HE 0x0008
-#define SMB2_WRITE_HE 0x0009
-#define SMB2_LOCK_HE 0x000A
-#define SMB2_IOCTL_HE 0x000B
-#define SMB2_CANCEL_HE 0x000C
-#define SMB2_ECHO_HE 0x000D
-#define SMB2_QUERY_DIRECTORY_HE 0x000E
-#define SMB2_CHANGE_NOTIFY_HE 0x000F
-#define SMB2_QUERY_INFO_HE 0x0010
-#define SMB2_SET_INFO_HE 0x0011
-#define SMB2_OPLOCK_BREAK_HE 0x0012
-
-/* The same list in little endian */
-#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
-#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE)
-#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE)
-#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE)
-#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
-#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE)
-#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE)
-#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE)
-#define SMB2_READ cpu_to_le16(SMB2_READ_HE)
-#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE)
-#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE)
-#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE)
-#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE)
-#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE)
-#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
-#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
-#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE)
-#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
-#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
-
/*Create Action Flags*/
#define FILE_SUPERSEDED 0x00000000
#define FILE_OPENED 0x00000001
#define FILE_CREATED 0x00000002
#define FILE_OVERWRITTEN 0x00000003
-/*
- * Size of the session key (crypto key encrypted with the password
- */
-#define SMB2_NTLMV2_SESSKEY_SIZE 16
-#define SMB2_SIGNATURE_SIZE 16
-#define SMB2_HMACSHA256_SIZE 32
-#define SMB2_CMACAES_SIZE 16
-#define SMB3_GCM128_CRYPTKEY_SIZE 16
-#define SMB3_GCM256_CRYPTKEY_SIZE 32
-
-/*
- * Size of the smb3 encryption/decryption keys
- */
-#define SMB3_ENC_DEC_KEY_SIZE 32
-
-/*
- * Size of the smb3 signing key
- */
-#define SMB3_SIGN_KEY_SIZE 16
-
-#define CIFS_CLIENT_CHALLENGE_SIZE 8
-#define SMB_SERVER_CHALLENGE_SIZE 8
-
/* SMB2 Max Credits */
#define SMB2_MAX_CREDITS 8192
-#define SMB2_CLIENT_GUID_SIZE 16
-#define SMB2_CREATE_GUID_SIZE 16
-
-/* Maximum buffer size value we can send with 1 credit */
-#define SMB2_MAX_BUFFER_SIZE 65536
-
-#define NUMBER_OF_SMB2_COMMANDS 0x0013
-
/* BB FIXME - analyze following length BB */
#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
-#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) /* 'B''M''S' */
-#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
-
#define SMB21_DEFAULT_IOSIZE (1024 * 1024)
-#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
#define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024)
#define SMB3_MIN_IOSIZE (64 * 1024)
#define SMB3_MAX_IOSIZE (8 * 1024 * 1024)
/*
- * SMB2 Header Definition
- *
- * "MBZ" : Must be Zero
- * "BB" : BugBug, Something to check/review/analyze later
- * "PDU" : "Protocol Data Unit" (ie a network "frame")
- *
- */
-
-#define __SMB2_HEADER_STRUCTURE_SIZE 64
-#define SMB2_HEADER_STRUCTURE_SIZE \
- cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE)
-
-struct smb2_hdr {
- __be32 smb2_buf_length; /* big endian on wire */
- /*
- * length is only two or three bytes - with
- * one or two byte type preceding it that MBZ
- */
- __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
- __le16 StructureSize; /* 64 */
- __le16 CreditCharge; /* MBZ */
- __le32 Status; /* Error from server */
- __le16 Command;
- __le16 CreditRequest; /* CreditResponse */
- __le32 Flags;
- __le32 NextCommand;
- __le64 MessageId;
- union {
- struct {
- __le32 ProcessId;
- __le32 TreeId;
- } __packed SyncId;
- __le64 AsyncId;
- } __packed Id;
- __le64 SessionId;
- __u8 Signature[16];
-} __packed;
-
-struct smb2_pdu {
- struct smb2_hdr hdr;
- __le16 StructureSize2; /* size of wct area (varies, request specific) */
-} __packed;
-
-#define SMB3_AES_CCM_NONCE 11
-#define SMB3_AES_GCM_NONCE 12
-
-struct smb2_transform_hdr {
- __be32 smb2_buf_length; /* big endian on wire */
- /*
- * length is only two or three bytes - with
- * one or two byte type preceding it that MBZ
- */
- __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
- __u8 Signature[16];
- __u8 Nonce[16];
- __le32 OriginalMessageSize;
- __u16 Reserved1;
- __le16 Flags; /* EncryptionAlgorithm */
- __le64 SessionId;
-} __packed;
-
-/*
- * SMB2 flag definitions
- */
-#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
-#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
-#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
-#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
-#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
-#define SMB2_FLAGS_REPLAY_OPERATIONS cpu_to_le32(0x20000000)
-
-/*
* Definitions for SMB2 Protocol Data Units (network frames)
*
* See MS-SMB2.PDF specification for protocol details.
@@ -197,437 +36,30 @@ struct smb2_transform_hdr {
*
*/
-#define SMB2_ERROR_STRUCTURE_SIZE2 9
-#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2)
-
-struct smb2_err_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize;
- __u8 ErrorContextCount;
- __u8 Reserved;
- __le32 ByteCount; /* even if zero, at least one byte follows */
- __u8 ErrorData[1]; /* variable length */
-} __packed;
-
-struct smb2_negotiate_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 36 */
- __le16 DialectCount;
- __le16 SecurityMode;
- __le16 Reserved; /* MBZ */
- __le32 Capabilities;
- __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
- /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
- __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
- __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
- __le16 Reserved2;
- __le16 Dialects[1]; /* One dialect (vers=) at a time for now */
-} __packed;
-
-/* SecurityMode flags */
-#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE cpu_to_le16(0x0001)
-#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
-#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002)
-/* Capabilities flags */
-#define SMB2_GLOBAL_CAP_DFS 0x00000001
-#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
-/* Internal types */
-#define SMB2_NT_FIND 0x00100000
-#define SMB2_LARGE_FILES 0x00200000
-
-#define SMB311_SALT_SIZE 32
-/* Hash Algorithm Types */
-#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
-
-#define PREAUTH_HASHVALUE_SIZE 64
-
struct preauth_integrity_info {
/* PreAuth integrity Hash ID */
__le16 Preauth_HashId;
/* PreAuth integrity Hash Value */
- __u8 Preauth_HashValue[PREAUTH_HASHVALUE_SIZE];
+ __u8 Preauth_HashValue[SMB2_PREAUTH_HASH_SIZE];
};
-/* offset is sizeof smb2_negotiate_rsp - 4 but rounded up to 8 bytes. */
+/* offset is sizeof smb2_negotiate_rsp but rounded up to 8 bytes. */
#ifdef CONFIG_SMB_SERVER_KERBEROS5
-/* sizeof(struct smb2_negotiate_rsp) - 4 =
+/* sizeof(struct smb2_negotiate_rsp) =
* header(64) + response(64) + GSS_LENGTH(96) + GSS_PADDING(0)
*/
#define OFFSET_OF_NEG_CONTEXT 0xe0
#else
-/* sizeof(struct smb2_negotiate_rsp) - 4 =
+/* sizeof(struct smb2_negotiate_rsp) =
* header(64) + response(64) + GSS_LENGTH(74) + GSS_PADDING(6)
*/
#define OFFSET_OF_NEG_CONTEXT 0xd0
#endif
-#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
-#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
-#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3)
-#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5)
-#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8)
-#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
-
-struct smb2_neg_context {
- __le16 ContextType;
- __le16 DataLength;
- __le32 Reserved;
- /* Followed by array of data */
-} __packed;
-
-struct smb2_preauth_neg_context {
- __le16 ContextType; /* 1 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 HashAlgorithmCount; /* 1 */
- __le16 SaltLength;
- __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
- __u8 Salt[SMB311_SALT_SIZE];
-} __packed;
-
-/* Encryption Algorithms Ciphers */
-#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
-#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
-#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003)
-#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004)
-
-struct smb2_encryption_neg_context {
- __le16 ContextType; /* 2 */
- __le16 DataLength;
- __le32 Reserved;
- /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */
- __le16 CipherCount; /* AES-128-GCM and AES-128-CCM by default */
- __le16 Ciphers[];
-} __packed;
-
-#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000)
-#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
-#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
-#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
-
-struct smb2_compression_ctx {
- __le16 ContextType; /* 3 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 CompressionAlgorithmCount;
- __u16 Padding;
- __le32 Reserved1;
- __le16 CompressionAlgorithms[];
-} __packed;
-
-#define POSIX_CTXT_DATA_LEN 16
-struct smb2_posix_neg_context {
- __le16 ContextType; /* 0x100 */
- __le16 DataLength;
- __le32 Reserved;
- __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
-} __packed;
-
-struct smb2_netname_neg_context {
- __le16 ContextType; /* 0x100 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 NetName[]; /* hostname of target converted to UCS-2 */
-} __packed;
-
-/* Signing algorithms */
-#define SIGNING_ALG_HMAC_SHA256 cpu_to_le16(0)
-#define SIGNING_ALG_AES_CMAC cpu_to_le16(1)
-#define SIGNING_ALG_AES_GMAC cpu_to_le16(2)
-
-struct smb2_signing_capabilities {
- __le16 ContextType; /* 8 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 SigningAlgorithmCount;
- __le16 SigningAlgorithms[];
-} __packed;
-
-struct smb2_negotiate_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 65 */
- __le16 SecurityMode;
- __le16 DialectRevision;
- __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */
- __u8 ServerGUID[16];
- __le32 Capabilities;
- __le32 MaxTransactSize;
- __le32 MaxReadSize;
- __le32 MaxWriteSize;
- __le64 SystemTime; /* MBZ */
- __le64 ServerStartTime;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-/* Flags */
-#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
-#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
-
#define SMB2_SESSION_EXPIRED (0)
#define SMB2_SESSION_IN_PROGRESS BIT(0)
#define SMB2_SESSION_VALID BIT(1)
-/* Flags */
-#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
-#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
-
-struct smb2_sess_setup_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 25 */
- __u8 Flags;
- __u8 SecurityMode;
- __le32 Capabilities;
- __le32 Channel;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __le64 PreviousSessionId;
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-/* Flags/Reserved for SMB3.1.1 */
-#define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001
-
-/* Currently defined SessionFlags */
-#define SMB2_SESSION_FLAG_IS_GUEST_LE cpu_to_le16(0x0001)
-#define SMB2_SESSION_FLAG_IS_NULL_LE cpu_to_le16(0x0002)
-#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004)
-struct smb2_sess_setup_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 SessionFlags;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-struct smb2_logoff_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_logoff_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_tree_connect_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 Reserved; /* Flags in SMB3.1.1 */
- __le16 PathOffset;
- __le16 PathLength;
- __u8 Buffer[1]; /* variable length */
-} __packed;
-
-struct smb2_tree_connect_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 16 */
- __u8 ShareType; /* see below */
- __u8 Reserved;
- __le32 ShareFlags; /* see below */
- __le32 Capabilities; /* see below */
- __le32 MaximalAccess;
-} __packed;
-
-/* Possible ShareType values */
-#define SMB2_SHARE_TYPE_DISK 0x01
-#define SMB2_SHARE_TYPE_PIPE 0x02
-#define SMB2_SHARE_TYPE_PRINT 0x03
-
-/*
- * Possible ShareFlags - exactly one and only one of the first 4 caching flags
- * must be set (any of the remaining, SHI1005, flags may be set individually
- * or in combination.
- */
-#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000
-#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010
-#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020
-#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
-#define SHI1005_FLAGS_DFS 0x00000001
-#define SHI1005_FLAGS_DFS_ROOT 0x00000002
-#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
-#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
-#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
-#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
-#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH 0x00002000
-
-/* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008)
-
-struct smb2_tree_disconnect_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_tree_disconnect_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-#define ATTR_READONLY_LE cpu_to_le32(ATTR_READONLY)
-#define ATTR_HIDDEN_LE cpu_to_le32(ATTR_HIDDEN)
-#define ATTR_SYSTEM_LE cpu_to_le32(ATTR_SYSTEM)
-#define ATTR_DIRECTORY_LE cpu_to_le32(ATTR_DIRECTORY)
-#define ATTR_ARCHIVE_LE cpu_to_le32(ATTR_ARCHIVE)
-#define ATTR_NORMAL_LE cpu_to_le32(ATTR_NORMAL)
-#define ATTR_TEMPORARY_LE cpu_to_le32(ATTR_TEMPORARY)
-#define ATTR_SPARSE_FILE_LE cpu_to_le32(ATTR_SPARSE)
-#define ATTR_REPARSE_POINT_LE cpu_to_le32(ATTR_REPARSE)
-#define ATTR_COMPRESSED_LE cpu_to_le32(ATTR_COMPRESSED)
-#define ATTR_OFFLINE_LE cpu_to_le32(ATTR_OFFLINE)
-#define ATTR_NOT_CONTENT_INDEXED_LE cpu_to_le32(ATTR_NOT_CONTENT_INDEXED)
-#define ATTR_ENCRYPTED_LE cpu_to_le32(ATTR_ENCRYPTED)
-#define ATTR_INTEGRITY_STREAML_LE cpu_to_le32(0x00008000)
-#define ATTR_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000)
-#define ATTR_MASK_LE cpu_to_le32(0x00007FB7)
-
-/* Oplock levels */
-#define SMB2_OPLOCK_LEVEL_NONE 0x00
-#define SMB2_OPLOCK_LEVEL_II 0x01
-#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
-#define SMB2_OPLOCK_LEVEL_BATCH 0x09
-#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
-/* Non-spec internal type */
-#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
-
-/* Desired Access Flags */
-#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
-#define FILE_LIST_DIRECTORY_LE cpu_to_le32(0x00000001)
-#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002)
-#define FILE_ADD_FILE_LE cpu_to_le32(0x00000002)
-#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004)
-#define FILE_ADD_SUBDIRECTORY_LE cpu_to_le32(0x00000004)
-#define FILE_READ_EA_LE cpu_to_le32(0x00000008)
-#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010)
-#define FILE_EXECUTE_LE cpu_to_le32(0x00000020)
-#define FILE_TRAVERSE_LE cpu_to_le32(0x00000020)
-#define FILE_DELETE_CHILD_LE cpu_to_le32(0x00000040)
-#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080)
-#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100)
-#define FILE_DELETE_LE cpu_to_le32(0x00010000)
-#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000)
-#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000)
-#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000)
-#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000)
-#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
-#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000)
-#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000)
-#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000)
-#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000)
-#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000)
-#define DESIRED_ACCESS_MASK cpu_to_le32(0xF21F01FF)
-
-/* ShareAccess Flags */
-#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001)
-#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002)
-#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004)
-#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007)
-
-/* CreateDisposition Flags */
-#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000)
-#define FILE_OPEN_LE cpu_to_le32(0x00000001)
-#define FILE_CREATE_LE cpu_to_le32(0x00000002)
-#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003)
-#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004)
-#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005)
-#define FILE_CREATE_MASK_LE cpu_to_le32(0x00000007)
-
-#define FILE_READ_DESIRED_ACCESS_LE (FILE_READ_DATA_LE | \
- FILE_READ_EA_LE | \
- FILE_GENERIC_READ_LE)
-#define FILE_WRITE_DESIRE_ACCESS_LE (FILE_WRITE_DATA_LE | \
- FILE_APPEND_DATA_LE | \
- FILE_WRITE_EA_LE | \
- FILE_WRITE_ATTRIBUTES_LE | \
- FILE_GENERIC_WRITE_LE)
-
-/* Impersonation Levels */
-#define IL_ANONYMOUS_LE cpu_to_le32(0x00000000)
-#define IL_IDENTIFICATION_LE cpu_to_le32(0x00000001)
-#define IL_IMPERSONATION_LE cpu_to_le32(0x00000002)
-#define IL_DELEGATE_LE cpu_to_le32(0x00000003)
-
-/* Create Context Values */
-#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */
-#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
-#define SMB2_CREATE_ALLOCATION_SIZE "AlSi"
-#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
-#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
-#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
-#define SMB2_CREATE_REQUEST_LEASE "RqLs"
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
-#define SMB2_CREATE_APP_INSTANCE_ID "\x45\xBC\xA6\x6A\xEF\xA7\xF7\x4A\x90\x08\xFA\x46\x2E\x14\x4D\x74"
- #define SMB2_CREATE_APP_INSTANCE_VERSION "\xB9\x82\xD0\xB7\x3B\x56\x07\x4F\xA0\x7B\x52\x4A\x81\x16\xA0\x10"
-#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9
-#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
-
-struct smb2_create_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 57 */
- __u8 SecurityFlags;
- __u8 RequestedOplockLevel;
- __le32 ImpersonationLevel;
- __le64 SmbCreateFlags;
- __le64 Reserved;
- __le32 DesiredAccess;
- __le32 FileAttributes;
- __le32 ShareAccess;
- __le32 CreateDisposition;
- __le32 CreateOptions;
- __le16 NameOffset;
- __le16 NameLength;
- __le32 CreateContextsOffset;
- __le32 CreateContextsLength;
- __u8 Buffer[0];
-} __packed;
-
-struct smb2_create_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 89 */
- __u8 OplockLevel;
- __u8 Reserved;
- __le32 CreateAction;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize;
- __le64 EndofFile;
- __le32 FileAttributes;
- __le32 Reserved2;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __le32 CreateContextsOffset;
- __le32 CreateContextsLength;
- __u8 Buffer[1];
-} __packed;
-
-struct create_context {
- __le32 Next;
- __le16 NameOffset;
- __le16 NameLength;
- __le16 Reserved;
- __le16 DataOffset;
- __le32 DataLength;
- __u8 Buffer[0];
-} __packed;
-
struct create_durable_req_v2 {
struct create_context ccontext;
__u8 Name[8];
@@ -643,8 +75,8 @@ struct create_durable_reconn_req {
union {
__u8 Reserved[16];
struct {
- __le64 PersistentFileId;
- __le64 VolatileFileId;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
} Fid;
} Data;
} __packed;
@@ -653,8 +85,8 @@ struct create_durable_reconn_v2_req {
struct create_context ccontext;
__u8 Name[8];
struct {
- __le64 PersistentFileId;
- __le64 VolatileFileId;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
} Fid;
__u8 CreateGuid[16];
__le32 Flags;
@@ -688,13 +120,6 @@ struct create_alloc_size_req {
__le64 AllocationSize;
} __packed;
-struct create_posix {
- struct create_context ccontext;
- __u8 Name[16];
- __le32 Mode;
- __u32 Reserved;
-} __packed;
-
struct create_durable_rsp {
struct create_context ccontext;
__u8 Name[8];
@@ -736,213 +161,14 @@ struct create_posix_rsp {
u8 SidBuffer[40];
} __packed;
-#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00)
-#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01)
-#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02)
-#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04)
-
-#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02)
-
-struct lease_context {
- __le64 LeaseKeyLow;
- __le64 LeaseKeyHigh;
- __le32 LeaseState;
- __le32 LeaseFlags;
- __le64 LeaseDuration;
-} __packed;
-
-struct lease_context_v2 {
- __le64 LeaseKeyLow;
- __le64 LeaseKeyHigh;
- __le32 LeaseState;
- __le32 LeaseFlags;
- __le64 LeaseDuration;
- __le64 ParentLeaseKeyLow;
- __le64 ParentLeaseKeyHigh;
- __le16 Epoch;
- __le16 Reserved;
-} __packed;
-
-struct create_lease {
- struct create_context ccontext;
- __u8 Name[8];
- struct lease_context lcontext;
-} __packed;
-
-struct create_lease_v2 {
- struct create_context ccontext;
- __u8 Name[8];
- struct lease_context_v2 lcontext;
- __u8 Pad[4];
-} __packed;
-
-/* Currently defined values for close flags */
-#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
-struct smb2_close_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 24 */
- __le16 Flags;
- __le32 Reserved;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
-} __packed;
-
-struct smb2_close_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* 60 */
- __le16 Flags;
- __le32 Reserved;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile;
- __le32 Attributes;
-} __packed;
-
-struct smb2_flush_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 24 */
- __le16 Reserved1;
- __le32 Reserved2;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
-} __packed;
-
-struct smb2_flush_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize;
- __le16 Reserved;
-} __packed;
-
struct smb2_buffer_desc_v1 {
__le64 offset;
__le32 token;
__le32 length;
} __packed;
-#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
-#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001)
-#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002)
-
-struct smb2_read_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 49 */
- __u8 Padding; /* offset from start of SMB2 header to place read */
- __u8 Reserved;
- __le32 Length;
- __le64 Offset;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __le32 MinimumCount;
- __le32 Channel; /* Reserved MBZ */
- __le32 RemainingBytes;
- __le16 ReadChannelInfoOffset; /* Reserved MBZ */
- __le16 ReadChannelInfoLength; /* Reserved MBZ */
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_read_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 17 */
- __u8 DataOffset;
- __u8 Reserved;
- __le32 DataLength;
- __le32 DataRemaining;
- __u32 Reserved2;
- __u8 Buffer[1];
-} __packed;
-
-/* For write request Flags field below the following flag is defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001
-
-struct smb2_write_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 49 */
- __le16 DataOffset; /* offset from start of SMB2 header to write data */
- __le32 Length;
- __le64 Offset;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __le32 Channel; /* Reserved MBZ */
- __le32 RemainingBytes;
- __le16 WriteChannelInfoOffset; /* Reserved MBZ */
- __le16 WriteChannelInfoLength; /* Reserved MBZ */
- __le32 Flags;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_write_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 17 */
- __u8 DataOffset;
- __u8 Reserved;
- __le32 DataLength;
- __le32 DataRemaining;
- __u32 Reserved2;
- __u8 Buffer[1];
-} __packed;
-
#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
-struct duplicate_extents_to_file {
- __u64 PersistentFileHandle; /* source file handle, opaque endianness */
- __u64 VolatileFileHandle;
- __le64 SourceFileOffset;
- __le64 TargetFileOffset;
- __le64 ByteCount; /* Bytes to be copied */
-} __packed;
-
-struct smb2_ioctl_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 57 */
- __le16 Reserved; /* offset from start of SMB2 header to write data */
- __le32 CntCode;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __le32 InputOffset; /* Reserved MBZ */
- __le32 InputCount;
- __le32 MaxInputResponse;
- __le32 OutputOffset;
- __le32 OutputCount;
- __le32 MaxOutputResponse;
- __le32 Flags;
- __le32 Reserved2;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_ioctl_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 49 */
- __le16 Reserved; /* offset from start of SMB2 header to write data */
- __le32 CntCode;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __le32 InputOffset; /* Reserved MBZ */
- __le32 InputCount;
- __le32 OutputOffset;
- __le32 OutputCount;
- __le32 Flags;
- __le32 Reserved2;
- __u8 Buffer[1];
-} __packed;
-
-struct validate_negotiate_info_req {
- __le32 Capabilities;
- __u8 Guid[SMB2_CLIENT_GUID_SIZE];
- __le16 SecurityMode;
- __le16 DialectCount;
- __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */
-} __packed;
-
-struct validate_negotiate_info_rsp {
- __le32 Capabilities;
- __u8 Guid[SMB2_CLIENT_GUID_SIZE];
- __le16 SecurityMode;
- __le16 Dialect; /* Dialect in use for the connection */
-} __packed;
-
struct smb_sockaddr_in {
__be16 Port;
__be32 IPv4address;
@@ -987,7 +213,7 @@ struct file_object_buf_type1_ioctl_rsp {
} __packed;
struct resume_key_ioctl_rsp {
- __le64 ResumeKey[3];
+ __u64 ResumeKey[3];
__le32 ContextLength;
__u8 Context[4]; /* ignored, Windows sets to 4 bytes of zero */
} __packed;
@@ -1016,204 +242,6 @@ struct file_sparse {
__u8 SetSparse;
} __packed;
-struct file_zero_data_information {
- __le64 FileOffset;
- __le64 BeyondFinalZero;
-} __packed;
-
-struct file_allocated_range_buffer {
- __le64 file_offset;
- __le64 length;
-} __packed;
-
-struct reparse_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __u8 DataBuffer[]; /* Variable Length */
-} __packed;
-
-/* Completion Filter flags for Notify */
-#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001
-#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002
-#define FILE_NOTIFY_CHANGE_NAME 0x00000003
-#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004
-#define FILE_NOTIFY_CHANGE_SIZE 0x00000008
-#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010
-#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020
-#define FILE_NOTIFY_CHANGE_CREATION 0x00000040
-#define FILE_NOTIFY_CHANGE_EA 0x00000080
-#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100
-#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200
-#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400
-#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800
-
-/* Flags */
-#define SMB2_WATCH_TREE 0x0001
-
-struct smb2_notify_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 32 */
- __le16 Flags;
- __le32 OutputBufferLength;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __u32 CompletionFileter;
- __u32 Reserved;
-} __packed;
-
-struct smb2_notify_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-/* SMB2 Notify Action Flags */
-#define FILE_ACTION_ADDED 0x00000001
-#define FILE_ACTION_REMOVED 0x00000002
-#define FILE_ACTION_MODIFIED 0x00000003
-#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004
-#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005
-#define FILE_ACTION_ADDED_STREAM 0x00000006
-#define FILE_ACTION_REMOVED_STREAM 0x00000007
-#define FILE_ACTION_MODIFIED_STREAM 0x00000008
-#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009
-
-#define SMB2_LOCKFLAG_SHARED 0x0001
-#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002
-#define SMB2_LOCKFLAG_UNLOCK 0x0004
-#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010
-#define SMB2_LOCKFLAG_MASK 0x0007
-
-struct smb2_lock_element {
- __le64 Offset;
- __le64 Length;
- __le32 Flags;
- __le32 Reserved;
-} __packed;
-
-struct smb2_lock_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 48 */
- __le16 LockCount;
- __le32 Reserved;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- /* Followed by at least one */
- struct smb2_lock_element locks[1];
-} __packed;
-
-struct smb2_lock_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_echo_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __u16 Reserved;
-} __packed;
-
-struct smb2_echo_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __u16 Reserved;
-} __packed;
-
-/* search (query_directory) Flags field */
-#define SMB2_RESTART_SCANS 0x01
-#define SMB2_RETURN_SINGLE_ENTRY 0x02
-#define SMB2_INDEX_SPECIFIED 0x04
-#define SMB2_REOPEN 0x10
-
-struct smb2_query_directory_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 33 */
- __u8 FileInformationClass;
- __u8 Flags;
- __le32 FileIndex;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __le16 FileNameOffset;
- __le16 FileNameLength;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_query_directory_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-/* Possible InfoType values */
-#define SMB2_O_INFO_FILE 0x01
-#define SMB2_O_INFO_FILESYSTEM 0x02
-#define SMB2_O_INFO_SECURITY 0x03
-#define SMB2_O_INFO_QUOTA 0x04
-
-/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */
-#define OWNER_SECINFO 0x00000001
-#define GROUP_SECINFO 0x00000002
-#define DACL_SECINFO 0x00000004
-#define SACL_SECINFO 0x00000008
-#define LABEL_SECINFO 0x00000010
-#define ATTRIBUTE_SECINFO 0x00000020
-#define SCOPE_SECINFO 0x00000040
-#define BACKUP_SECINFO 0x00010000
-#define UNPROTECTED_SACL_SECINFO 0x10000000
-#define UNPROTECTED_DACL_SECINFO 0x20000000
-#define PROTECTED_SACL_SECINFO 0x40000000
-#define PROTECTED_DACL_SECINFO 0x80000000
-
-struct smb2_query_info_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 41 */
- __u8 InfoType;
- __u8 FileInfoClass;
- __le32 OutputBufferLength;
- __le16 InputBufferOffset;
- __u16 Reserved;
- __le32 InputBufferLength;
- __le32 AdditionalInformation;
- __le32 Flags;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_query_info_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_set_info_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 33 */
- __u8 InfoType;
- __u8 FileInfoClass;
- __le32 BufferLength;
- __le16 BufferOffset;
- __u16 Reserved;
- __le32 AdditionalInformation;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_set_info_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 2 */
-} __packed;
-
/* FILE Info response size */
#define FILE_DIRECTORY_INFORMATION_SIZE 1
#define FILE_FULL_DIRECTORY_INFORMATION_SIZE 2
@@ -1269,145 +297,11 @@ struct fs_type_info {
long magic_number;
} __packed;
-struct smb2_oplock_break {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 24 */
- __u8 OplockLevel;
- __u8 Reserved;
- __le32 Reserved2;
- __le64 PersistentFid;
- __le64 VolatileFid;
-} __packed;
-
-#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
-
-struct smb2_lease_break {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 44 */
- __le16 Epoch;
- __le32 Flags;
- __u8 LeaseKey[16];
- __le32 CurrentLeaseState;
- __le32 NewLeaseState;
- __le32 BreakReason;
- __le32 AccessMaskHint;
- __le32 ShareMaskHint;
-} __packed;
-
-struct smb2_lease_ack {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 36 */
- __le16 Reserved;
- __le32 Flags;
- __u8 LeaseKey[16];
- __le32 LeaseState;
- __le64 LeaseDuration;
-} __packed;
-
/*
- * PDU infolevel structure definitions
+ * PDU query infolevel structure definitions
* BB consider moving to a different header
*/
-/* File System Information Classes */
-#define FS_VOLUME_INFORMATION 1 /* Query */
-#define FS_LABEL_INFORMATION 2 /* Set */
-#define FS_SIZE_INFORMATION 3 /* Query */
-#define FS_DEVICE_INFORMATION 4 /* Query */
-#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
-#define FS_CONTROL_INFORMATION 6 /* Query, Set */
-#define FS_FULL_SIZE_INFORMATION 7 /* Query */
-#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
-#define FS_DRIVER_PATH_INFORMATION 9 /* Query */
-#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */
-#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */
-
-struct smb2_fs_full_size_info {
- __le64 TotalAllocationUnits;
- __le64 CallerAvailableAllocationUnits;
- __le64 ActualAvailableAllocationUnits;
- __le32 SectorsPerAllocationUnit;
- __le32 BytesPerSector;
-} __packed;
-
-#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001
-#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
-#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004
-#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008
-
-/* sector size info struct */
-struct smb3_fs_ss_info {
- __le32 LogicalBytesPerSector;
- __le32 PhysicalBytesPerSectorForAtomicity;
- __le32 PhysicalBytesPerSectorForPerf;
- __le32 FSEffPhysicalBytesPerSectorForAtomicity;
- __le32 Flags;
- __le32 ByteOffsetForSectorAlignment;
- __le32 ByteOffsetForPartitionAlignment;
-} __packed;
-
-/* File System Control Information */
-struct smb2_fs_control_info {
- __le64 FreeSpaceStartFiltering;
- __le64 FreeSpaceThreshold;
- __le64 FreeSpaceStopFiltering;
- __le64 DefaultQuotaThreshold;
- __le64 DefaultQuotaLimit;
- __le32 FileSystemControlFlags;
- __le32 Padding;
-} __packed;
-
-/* partial list of QUERY INFO levels */
-#define FILE_DIRECTORY_INFORMATION 1
-#define FILE_FULL_DIRECTORY_INFORMATION 2
-#define FILE_BOTH_DIRECTORY_INFORMATION 3
-#define FILE_BASIC_INFORMATION 4
-#define FILE_STANDARD_INFORMATION 5
-#define FILE_INTERNAL_INFORMATION 6
-#define FILE_EA_INFORMATION 7
-#define FILE_ACCESS_INFORMATION 8
-#define FILE_NAME_INFORMATION 9
-#define FILE_RENAME_INFORMATION 10
-#define FILE_LINK_INFORMATION 11
-#define FILE_NAMES_INFORMATION 12
-#define FILE_DISPOSITION_INFORMATION 13
-#define FILE_POSITION_INFORMATION 14
-#define FILE_FULL_EA_INFORMATION 15
-#define FILE_MODE_INFORMATION 16
-#define FILE_ALIGNMENT_INFORMATION 17
-#define FILE_ALL_INFORMATION 18
-#define FILE_ALLOCATION_INFORMATION 19
-#define FILE_END_OF_FILE_INFORMATION 20
-#define FILE_ALTERNATE_NAME_INFORMATION 21
-#define FILE_STREAM_INFORMATION 22
-#define FILE_PIPE_INFORMATION 23
-#define FILE_PIPE_LOCAL_INFORMATION 24
-#define FILE_PIPE_REMOTE_INFORMATION 25
-#define FILE_MAILSLOT_QUERY_INFORMATION 26
-#define FILE_MAILSLOT_SET_INFORMATION 27
-#define FILE_COMPRESSION_INFORMATION 28
-#define FILE_OBJECT_ID_INFORMATION 29
-/* Number 30 not defined in documents */
-#define FILE_MOVE_CLUSTER_INFORMATION 31
-#define FILE_QUOTA_INFORMATION 32
-#define FILE_REPARSE_POINT_INFORMATION 33
-#define FILE_NETWORK_OPEN_INFORMATION 34
-#define FILE_ATTRIBUTE_TAG_INFORMATION 35
-#define FILE_TRACKING_INFORMATION 36
-#define FILEID_BOTH_DIRECTORY_INFORMATION 37
-#define FILEID_FULL_DIRECTORY_INFORMATION 38
-#define FILE_VALID_DATA_LENGTH_INFORMATION 39
-#define FILE_SHORT_NAME_INFORMATION 40
-#define FILE_SFIO_RESERVE_INFORMATION 44
-#define FILE_SFIO_VOLUME_INFORMATION 45
-#define FILE_HARD_LINK_INFORMATION 46
-#define FILE_NORMALIZED_NAME_INFORMATION 48
-#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
-#define FILE_STANDARD_LINK_INFORMATION 54
-
-#define OP_BREAK_STRUCT_SIZE_20 24
-#define OP_BREAK_STRUCT_SIZE_21 36
-
struct smb2_file_access_info {
__le32 AccessFlags;
} __packed;
@@ -1416,56 +310,6 @@ struct smb2_file_alignment_info {
__le32 AlignmentRequirement;
} __packed;
-struct smb2_file_internal_info {
- __le64 IndexNumber;
-} __packed; /* level 6 Query */
-
-struct smb2_file_rename_info { /* encoding of request for level 10 */
- __u8 ReplaceIfExists; /* 1 = replace existing target with new */
- /* 0 = fail if target already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
- char FileName[0]; /* New name to be assigned */
-} __packed; /* level 10 Set */
-
-struct smb2_file_link_info { /* encoding of request for level 11 */
- __u8 ReplaceIfExists; /* 1 = replace existing link with new */
- /* 0 = fail if link already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
- char FileName[0]; /* Name to be assigned to new link */
-} __packed; /* level 11 Set */
-
-/*
- * This level 18, although with struct with same name is different from cifs
- * level 0x107. Level 0x107 has an extra u64 between AccessFlags and
- * CurrentByteOffset.
- */
-struct smb2_file_all_info { /* data block encoding of response to level 18 */
- __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le32 Attributes;
- __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile; /* size ie offset to first free byte in file */
- __le32 NumberOfLinks; /* hard links */
- __u8 DeletePending;
- __u8 Directory;
- __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */
- __le64 IndexNumber;
- __le32 EASize;
- __le32 AccessFlags;
- __le64 CurrentByteOffset;
- __le32 Mode;
- __le32 AlignmentRequirement;
- __le32 FileNameLength;
- char FileName[1];
-} __packed; /* level 18 Query */
-
struct smb2_file_basic_info { /* data block encoding of response to level 18 */
__le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
__le64 LastAccessTime;
@@ -1477,7 +321,7 @@ struct smb2_file_basic_info { /* data block encoding of response to level 18 */
struct smb2_file_alt_name_info {
__le32 FileNameLength;
- char FileName[0];
+ char FileName[];
} __packed;
struct smb2_file_stream_info {
@@ -1485,13 +329,9 @@ struct smb2_file_stream_info {
__le32 StreamNameLength;
__le64 StreamSize;
__le64 StreamAllocationSize;
- char StreamName[0];
+ char StreamName[];
} __packed;
-struct smb2_file_eof_info { /* encoding of request for level 10 */
- __le64 EndOfFile; /* new end of file value */
-} __packed; /* level 20 Set */
-
struct smb2_file_ntwrk_info {
__le64 CreationTime;
__le64 LastAccessTime;
@@ -1528,7 +368,7 @@ struct smb2_file_pos_info {
__le64 CurrentByteOffset;
} __packed;
-#define FILE_MODE_INFO_MASK cpu_to_le32(0x0000103e)
+#define FILE_MODE_INFO_MASK cpu_to_le32(0x0000100e)
struct smb2_file_mode_info {
__le32 Mode;
@@ -1582,34 +422,6 @@ struct create_sd_buf_req {
struct smb_ntsd ntsd;
} __packed;
-/* Find File infolevels */
-#define SMB_FIND_FILE_POSIX_INFO 0x064
-
-/* Level 100 query info */
-struct smb311_posix_qinfo {
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 EndOfFile;
- __le64 AllocationSize;
- __le32 DosAttributes;
- __le64 Inode;
- __le32 DeviceId;
- __le32 Zero;
- /* beginning of POSIX Create Context Response */
- __le32 HardLinks;
- __le32 ReparseTag;
- __le32 Mode;
- u8 Sids[];
- /*
- * var sized owner SID
- * var sized group SID
- * le32 filenamelength
- * u8 filename[]
- */
-} __packed;
-
struct smb2_posix_info {
__le32 NextEntryOffset;
__u32 Ignored;
@@ -1647,6 +459,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn);
void init_smb2_max_read_size(unsigned int sz);
void init_smb2_max_write_size(unsigned int sz);
void init_smb2_max_trans_size(unsigned int sz);
+void init_smb2_max_credits(unsigned int sz);
bool is_smb2_neg_cmd(struct ksmbd_work *work);
bool is_smb2_rsp(struct ksmbd_work *work);
@@ -1705,4 +518,13 @@ int smb2_ioctl(struct ksmbd_work *work);
int smb2_oplock_break(struct ksmbd_work *work);
int smb2_notify(struct ksmbd_work *ksmbd_work);
+/*
+ * Get the body of the smb2 message excluding the 4 byte rfc1002 headers
+ * from request/response buffer.
+ */
+static inline void *smb2_get_msg(void *buf)
+{
+ return buf + 4;
+}
+
#endif /* _SMB2PDU_H */
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index 707490ab1f4c..9a7e211dbf4f 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -132,7 +132,7 @@ int ksmbd_lookup_protocol_idx(char *str)
*/
int ksmbd_verify_smb_message(struct ksmbd_work *work)
{
- struct smb2_hdr *smb2_hdr = work->request_buf + work->next_smb2_rcv_hdr_off;
+ struct smb2_hdr *smb2_hdr = ksmbd_req_buf_next(work);
struct smb_hdr *hdr;
if (smb2_hdr->ProtocolId == SMB2_PROTO_NUMBER)
@@ -239,14 +239,14 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count)
static int ksmbd_negotiate_smb_dialect(void *buf)
{
int smb_buf_length = get_rfc1002_len(buf);
- __le32 proto = ((struct smb2_hdr *)buf)->ProtocolId;
+ __le32 proto = ((struct smb2_hdr *)smb2_get_msg(buf))->ProtocolId;
if (proto == SMB2_PROTO_NUMBER) {
struct smb2_negotiate_req *req;
int smb2_neg_size =
- offsetof(struct smb2_negotiate_req, Dialects) - 4;
+ offsetof(struct smb2_negotiate_req, Dialects);
- req = (struct smb2_negotiate_req *)buf;
+ req = (struct smb2_negotiate_req *)smb2_get_msg(buf);
if (smb2_neg_size > smb_buf_length)
goto err_out;
@@ -308,14 +308,17 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
for (i = 0; i < 2; i++) {
struct kstat kstat;
struct ksmbd_kstat ksmbd_kstat;
+ struct dentry *dentry;
if (!dir->dot_dotdot[i]) { /* fill dot entry info */
if (i == 0) {
d_info->name = ".";
d_info->name_len = 1;
+ dentry = dir->filp->f_path.dentry;
} else {
d_info->name = "..";
d_info->name_len = 2;
+ dentry = dir->filp->f_path.dentry->d_parent;
}
if (!match_pattern(d_info->name, d_info->name_len,
@@ -327,7 +330,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
ksmbd_kstat.kstat = &kstat;
ksmbd_vfs_fill_dentry_attrs(work,
user_ns,
- dir->filp->f_path.dentry->d_parent,
+ dentry,
&ksmbd_kstat);
rc = fn(conn, info_level, d_info, &ksmbd_kstat);
if (rc)
@@ -445,11 +448,12 @@ int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command)
struct ksmbd_conn *conn = work->conn;
int ret;
- conn->dialect = ksmbd_negotiate_smb_dialect(work->request_buf);
+ conn->dialect =
+ ksmbd_negotiate_smb_dialect(work->request_buf);
ksmbd_debug(SMB, "conn->dialect 0x%x\n", conn->dialect);
if (command == SMB2_NEGOTIATE_HE) {
- struct smb2_hdr *smb2_hdr = work->request_buf;
+ struct smb2_hdr *smb2_hdr = smb2_get_msg(work->request_buf);
if (smb2_hdr->ProtocolId != SMB2_PROTO_NUMBER) {
ksmbd_debug(SMB, "Downgrade to SMB1 negotiation\n");
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index 6e79e7577f6b..e1369b4345a9 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -10,6 +10,7 @@
#include "glob.h"
#include "nterr.h"
+#include "../smbfs_common/smb2pdu.h"
#include "smb2pdu.h"
/* ksmbd's Specific ERRNO */
@@ -32,17 +33,6 @@
#define SMB302_VERSION_STRING "3.02"
#define SMB311_VERSION_STRING "3.1.1"
-/* Dialects */
-#define SMB10_PROT_ID 0x00
-#define SMB20_PROT_ID 0x0202
-#define SMB21_PROT_ID 0x0210
-/* multi-protocol negotiate request */
-#define SMB2X_PROT_ID 0x02FF
-#define SMB30_PROT_ID 0x0300
-#define SMB302_PROT_ID 0x0302
-#define SMB311_PROT_ID 0x0311
-#define BAD_PROT_ID 0xFFFF
-
#define SMB_ECHO_INTERVAL (60 * HZ)
#define CIFS_DEFAULT_IOSIZE (64 * 1024)
@@ -59,21 +49,6 @@
/*
* File Attribute flags
*/
-#define ATTR_READONLY 0x0001
-#define ATTR_HIDDEN 0x0002
-#define ATTR_SYSTEM 0x0004
-#define ATTR_VOLUME 0x0008
-#define ATTR_DIRECTORY 0x0010
-#define ATTR_ARCHIVE 0x0020
-#define ATTR_DEVICE 0x0040
-#define ATTR_NORMAL 0x0080
-#define ATTR_TEMPORARY 0x0100
-#define ATTR_SPARSE 0x0200
-#define ATTR_REPARSE 0x0400
-#define ATTR_COMPRESSED 0x0800
-#define ATTR_OFFLINE 0x1000
-#define ATTR_NOT_CONTENT_INDEXED 0x2000
-#define ATTR_ENCRYPTED 0x4000
#define ATTR_POSIX_SEMANTICS 0x01000000
#define ATTR_BACKUP_SEMANTICS 0x02000000
#define ATTR_DELETE_ON_CLOSE 0x04000000
@@ -82,23 +57,6 @@
#define ATTR_NO_BUFFERING 0x20000000
#define ATTR_WRITE_THROUGH 0x80000000
-#define ATTR_READONLY_LE cpu_to_le32(ATTR_READONLY)
-#define ATTR_HIDDEN_LE cpu_to_le32(ATTR_HIDDEN)
-#define ATTR_SYSTEM_LE cpu_to_le32(ATTR_SYSTEM)
-#define ATTR_DIRECTORY_LE cpu_to_le32(ATTR_DIRECTORY)
-#define ATTR_ARCHIVE_LE cpu_to_le32(ATTR_ARCHIVE)
-#define ATTR_NORMAL_LE cpu_to_le32(ATTR_NORMAL)
-#define ATTR_TEMPORARY_LE cpu_to_le32(ATTR_TEMPORARY)
-#define ATTR_SPARSE_FILE_LE cpu_to_le32(ATTR_SPARSE)
-#define ATTR_REPARSE_POINT_LE cpu_to_le32(ATTR_REPARSE)
-#define ATTR_COMPRESSED_LE cpu_to_le32(ATTR_COMPRESSED)
-#define ATTR_OFFLINE_LE cpu_to_le32(ATTR_OFFLINE)
-#define ATTR_NOT_CONTENT_INDEXED_LE cpu_to_le32(ATTR_NOT_CONTENT_INDEXED)
-#define ATTR_ENCRYPTED_LE cpu_to_le32(ATTR_ENCRYPTED)
-#define ATTR_INTEGRITY_STREAML_LE cpu_to_le32(0x00008000)
-#define ATTR_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000)
-#define ATTR_MASK_LE cpu_to_le32(0x00007FB7)
-
/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */
#define FILE_SUPPORTS_SPARSE_VDL 0x10000000 /* faster nonsparse extend */
#define FILE_SUPPORTS_BLOCK_REFCOUNTING 0x08000000 /* allow ioctl dup extents */
@@ -160,11 +118,6 @@
/* file_execute, file_read_attributes*/
/* write_dac, and delete. */
-#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES)
-#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
- | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES)
-#define FILE_EXEC_RIGHTS (FILE_EXECUTE)
-
#define SET_FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \
| FILE_READ_ATTRIBUTES \
| DELETE | READ_CONTROL | WRITE_DAC \
@@ -412,6 +365,7 @@ struct smb_version_values {
__u32 max_read_size;
__u32 max_write_size;
__u32 max_trans_size;
+ __u32 max_credits;
__u32 large_lock_type;
__u32 exclusive_lock_type;
__u32 shared_lock_type;
@@ -477,12 +431,6 @@ struct smb_version_cmds {
int (*proc)(struct ksmbd_work *swork);
};
-static inline size_t
-smb2_hdr_size_no_buflen(struct smb_version_values *vals)
-{
- return vals->header_size - 4;
-}
-
int ksmbd_min_protocol(void);
int ksmbd_max_protocol(void);
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index bd792db32623..6ecf55ea1fed 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/mnt_idmapping.h>
#include "smbacl.h"
#include "smb_common.h"
@@ -274,14 +275,7 @@ static int sid_to_id(struct user_namespace *user_ns,
uid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- /*
- * Translate raw sid into kuid in the server's user
- * namespace.
- */
- uid = make_kuid(&init_user_ns, id);
-
- /* If this is an idmapped mount, apply the idmapping. */
- uid = kuid_from_mnt(user_ns, uid);
+ uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id));
if (uid_valid(uid)) {
fattr->cf_uid = uid;
rc = 0;
@@ -291,14 +285,7 @@ static int sid_to_id(struct user_namespace *user_ns,
gid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- /*
- * Translate raw sid into kgid in the server's user
- * namespace.
- */
- gid = make_kgid(&init_user_ns, id);
-
- /* If this is an idmapped mount, apply the idmapping. */
- gid = kgid_from_mnt(user_ns, gid);
+ gid = mapped_kgid_user(user_ns, &init_user_ns, KGIDT_INIT(id));
if (gid_valid(gid)) {
fattr->cf_gid = gid;
rc = 0;
diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h
index 73e08cad412b..811af3309429 100644
--- a/fs/ksmbd/smbacl.h
+++ b/fs/ksmbd/smbacl.h
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/posix_acl.h>
+#include <linux/mnt_idmapping.h>
#include "mgmt/tree_connect.h"
@@ -216,7 +217,7 @@ static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns,
kuid_t kuid;
/* If this is an idmapped mount, apply the idmapping. */
- kuid = kuid_into_mnt(mnt_userns, pace->e_uid);
+ kuid = mapped_kuid_fs(mnt_userns, &init_user_ns, pace->e_uid);
/* Translate the kuid into a userspace id ksmbd would see. */
return from_kuid(&init_user_ns, kuid);
@@ -228,7 +229,7 @@ static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns,
kgid_t kgid;
/* If this is an idmapped mount, apply the idmapping. */
- kgid = kgid_into_mnt(mnt_userns, pace->e_gid);
+ kgid = mapped_kgid_fs(mnt_userns, &init_user_ns, pace->e_gid);
/* Translate the kgid into a userspace id ksmbd would see. */
return from_kgid(&init_user_ns, kgid);
diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c
index 1acf1892a466..3ad6881e0f7e 100644
--- a/fs/ksmbd/transport_ipc.c
+++ b/fs/ksmbd/transport_ipc.c
@@ -301,6 +301,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
init_smb2_max_write_size(req->smb2_max_write);
if (req->smb2_max_trans)
init_smb2_max_trans_size(req->smb2_max_trans);
+ if (req->smb2_max_credits)
+ init_smb2_max_credits(req->smb2_max_credits);
ret = ksmbd_set_netbios_name(req->netbios_name);
ret |= ksmbd_set_server_string(req->server_string);
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index a2fd5a4d4cd5..e646d79554b8 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -34,7 +34,8 @@
#include "smbstatus.h"
#include "transport_rdma.h"
-#define SMB_DIRECT_PORT 5445
+#define SMB_DIRECT_PORT_IWARP 5445
+#define SMB_DIRECT_PORT_INFINIBAND 445
#define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100)
@@ -60,6 +61,10 @@
* as defined in [MS-SMBD] 3.1.1.1
* Those may change after a SMB_DIRECT negotiation
*/
+
+/* Set 445 port to SMB Direct port by default */
+static int smb_direct_port = SMB_DIRECT_PORT_INFINIBAND;
+
/* The local peer's maximum number of credits to grant to the peer */
static int smb_direct_receive_credit_max = 255;
@@ -75,10 +80,18 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
/* The maximum single-message size which can be received */
static int smb_direct_max_receive_size = 8192;
-static int smb_direct_max_read_write_size = 1024 * 1024;
+static int smb_direct_max_read_write_size = 524224;
static int smb_direct_max_outstanding_rw_ops = 8;
+static LIST_HEAD(smb_direct_device_list);
+static DEFINE_RWLOCK(smb_direct_device_lock);
+
+struct smb_direct_device {
+ struct ib_device *ib_dev;
+ struct list_head list;
+};
+
static struct smb_direct_listener {
struct rdma_cm_id *cm_id;
} smb_direct_listener;
@@ -198,7 +211,7 @@ struct smb_direct_rdma_rw_msg {
struct completion *completion;
struct rdma_rw_ctx rw_ctx;
struct sg_table sgt;
- struct scatterlist sg_list[0];
+ struct scatterlist sg_list[];
};
static inline int get_buf_page_count(void *buf, int size)
@@ -415,6 +428,7 @@ static void free_transport(struct smb_direct_transport *t)
if (t->qp) {
ib_drain_qp(t->qp);
+ ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs);
ib_destroy_qp(t->qp);
}
@@ -484,7 +498,7 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg)
struct smb_direct_data_transfer *req =
(struct smb_direct_data_transfer *)recvmsg->packet;
struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet
- + le32_to_cpu(req->data_offset) - 4);
+ + le32_to_cpu(req->data_offset));
ksmbd_debug(RDMA,
"CreditGranted: %u, CreditRequested: %u, DataLength: %u, RemainingDataLength: %u, SMB: %x, Command: %u\n",
le16_to_cpu(req->credits_granted),
@@ -555,6 +569,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
}
t->negotiation_requested = true;
t->full_packet_received = true;
+ enqueue_reassembly(t, recvmsg, 0);
wake_up_interruptible(&t->wait_status);
break;
case SMB_DIRECT_MSG_DATA_TRANSFER: {
@@ -1438,6 +1453,15 @@ static void smb_direct_disconnect(struct ksmbd_transport *t)
free_transport(st);
}
+static void smb_direct_shutdown(struct ksmbd_transport *t)
+{
+ struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+
+ ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id);
+
+ smb_direct_disconnect_rdma_work(&st->disconnect_work);
+}
+
static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
@@ -1581,19 +1605,13 @@ static int smb_direct_accept_client(struct smb_direct_transport *t)
pr_err("error at rdma_accept: %d\n", ret);
return ret;
}
-
- wait_event_interruptible(t->wait_status,
- t->status != SMB_DIRECT_CS_NEW);
- if (t->status != SMB_DIRECT_CS_CONNECTED)
- return -ENOTCONN;
return 0;
}
-static int smb_direct_negotiate(struct smb_direct_transport *t)
+static int smb_direct_prepare_negotiation(struct smb_direct_transport *t)
{
int ret;
struct smb_direct_recvmsg *recvmsg;
- struct smb_direct_negotiate_req *req;
recvmsg = get_free_recvmsg(t);
if (!recvmsg)
@@ -1603,44 +1621,20 @@ static int smb_direct_negotiate(struct smb_direct_transport *t)
ret = smb_direct_post_recv(t, recvmsg);
if (ret) {
pr_err("Can't post recv: %d\n", ret);
- goto out;
+ goto out_err;
}
t->negotiation_requested = false;
ret = smb_direct_accept_client(t);
if (ret) {
pr_err("Can't accept client\n");
- goto out;
+ goto out_err;
}
smb_direct_post_recv_credits(&t->post_recv_credits_work.work);
-
- ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n");
- ret = wait_event_interruptible_timeout(t->wait_status,
- t->negotiation_requested ||
- t->status == SMB_DIRECT_CS_DISCONNECTED,
- SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ);
- if (ret <= 0 || t->status == SMB_DIRECT_CS_DISCONNECTED) {
- ret = ret < 0 ? ret : -ETIMEDOUT;
- goto out;
- }
-
- ret = smb_direct_check_recvmsg(recvmsg);
- if (ret == -ECONNABORTED)
- goto out;
-
- req = (struct smb_direct_negotiate_req *)recvmsg->packet;
- t->max_recv_size = min_t(int, t->max_recv_size,
- le32_to_cpu(req->preferred_send_size));
- t->max_send_size = min_t(int, t->max_send_size,
- le32_to_cpu(req->max_receive_size));
- t->max_fragmented_send_size =
- le32_to_cpu(req->max_fragmented_size);
-
- ret = smb_direct_send_negotiate_response(t, ret);
-out:
- if (recvmsg)
- put_recvmsg(t, recvmsg);
+ return 0;
+out_err:
+ put_recvmsg(t, recvmsg);
return ret;
}
@@ -1724,7 +1718,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
cap->max_inline_data = 0;
- cap->max_rdma_ctxs = 0;
+ cap->max_rdma_ctxs =
+ rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) *
+ smb_direct_max_outstanding_rw_ops;
return 0;
}
@@ -1806,6 +1802,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
{
int ret;
struct ib_qp_init_attr qp_attr;
+ int pages_per_rw;
t->pd = ib_alloc_pd(t->cm_id->device, 0);
if (IS_ERR(t->pd)) {
@@ -1853,6 +1850,23 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
t->qp = t->cm_id->qp;
t->cm_id->event_handler = smb_direct_cm_handler;
+ pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
+ if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) {
+ int pages_per_mr, mr_count;
+
+ pages_per_mr = min_t(int, pages_per_rw,
+ t->cm_id->device->attrs.max_fast_reg_page_list_len);
+ mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) *
+ atomic_read(&t->rw_avail_ops);
+ ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count,
+ IB_MR_TYPE_MEM_REG, pages_per_mr, 0);
+ if (ret) {
+ pr_err("failed to init mr pool count %d pages %d\n",
+ mr_count, pages_per_mr);
+ goto err;
+ }
+ }
+
return 0;
err:
if (t->qp) {
@@ -1877,6 +1891,49 @@ err:
static int smb_direct_prepare(struct ksmbd_transport *t)
{
struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+ struct smb_direct_recvmsg *recvmsg;
+ struct smb_direct_negotiate_req *req;
+ int ret;
+
+ ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n");
+ ret = wait_event_interruptible_timeout(st->wait_status,
+ st->negotiation_requested ||
+ st->status == SMB_DIRECT_CS_DISCONNECTED,
+ SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ);
+ if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED)
+ return ret < 0 ? ret : -ETIMEDOUT;
+
+ recvmsg = get_first_reassembly(st);
+ if (!recvmsg)
+ return -ECONNABORTED;
+
+ ret = smb_direct_check_recvmsg(recvmsg);
+ if (ret == -ECONNABORTED)
+ goto out;
+
+ req = (struct smb_direct_negotiate_req *)recvmsg->packet;
+ st->max_recv_size = min_t(int, st->max_recv_size,
+ le32_to_cpu(req->preferred_send_size));
+ st->max_send_size = min_t(int, st->max_send_size,
+ le32_to_cpu(req->max_receive_size));
+ st->max_fragmented_send_size =
+ le32_to_cpu(req->max_fragmented_size);
+ st->max_fragmented_recv_size =
+ (st->recv_credit_max * st->max_recv_size) / 2;
+
+ ret = smb_direct_send_negotiate_response(st, ret);
+out:
+ spin_lock_irq(&st->reassembly_queue_lock);
+ st->reassembly_queue_length--;
+ list_del(&recvmsg->list);
+ spin_unlock_irq(&st->reassembly_queue_lock);
+ put_recvmsg(st, recvmsg);
+
+ return ret;
+}
+
+static int smb_direct_connect(struct smb_direct_transport *st)
+{
int ret;
struct ib_qp_cap qp_cap;
@@ -1898,13 +1955,11 @@ static int smb_direct_prepare(struct ksmbd_transport *t)
return ret;
}
- ret = smb_direct_negotiate(st);
+ ret = smb_direct_prepare_negotiation(st);
if (ret) {
pr_err("Can't negotiate: %d\n", ret);
return ret;
}
-
- st->status = SMB_DIRECT_CS_CONNECTED;
return 0;
}
@@ -1920,6 +1975,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
{
struct smb_direct_transport *t;
+ int ret;
if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
ksmbd_debug(RDMA,
@@ -1932,18 +1988,23 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
if (!t)
return -ENOMEM;
+ ret = smb_direct_connect(t);
+ if (ret)
+ goto out_err;
+
KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
KSMBD_TRANS(t)->conn, "ksmbd:r%u",
- SMB_DIRECT_PORT);
+ smb_direct_port);
if (IS_ERR(KSMBD_TRANS(t)->handler)) {
- int ret = PTR_ERR(KSMBD_TRANS(t)->handler);
-
+ ret = PTR_ERR(KSMBD_TRANS(t)->handler);
pr_err("Can't start thread\n");
- free_transport(t);
- return ret;
+ goto out_err;
}
return 0;
+out_err:
+ free_transport(t);
+ return ret;
}
static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
@@ -2007,12 +2068,65 @@ err:
return ret;
}
+static int smb_direct_ib_client_add(struct ib_device *ib_dev)
+{
+ struct smb_direct_device *smb_dev;
+
+ /* Set 5445 port if device type is iWARP(No IB) */
+ if (ib_dev->node_type != RDMA_NODE_IB_CA)
+ smb_direct_port = SMB_DIRECT_PORT_IWARP;
+
+ if (!ib_dev->ops.get_netdev ||
+ !rdma_frwr_is_supported(&ib_dev->attrs))
+ return 0;
+
+ smb_dev = kzalloc(sizeof(*smb_dev), GFP_KERNEL);
+ if (!smb_dev)
+ return -ENOMEM;
+ smb_dev->ib_dev = ib_dev;
+
+ write_lock(&smb_direct_device_lock);
+ list_add(&smb_dev->list, &smb_direct_device_list);
+ write_unlock(&smb_direct_device_lock);
+
+ ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name);
+ return 0;
+}
+
+static void smb_direct_ib_client_remove(struct ib_device *ib_dev,
+ void *client_data)
+{
+ struct smb_direct_device *smb_dev, *tmp;
+
+ write_lock(&smb_direct_device_lock);
+ list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) {
+ if (smb_dev->ib_dev == ib_dev) {
+ list_del(&smb_dev->list);
+ kfree(smb_dev);
+ break;
+ }
+ }
+ write_unlock(&smb_direct_device_lock);
+}
+
+static struct ib_client smb_direct_ib_client = {
+ .name = "ksmbd_smb_direct_ib",
+ .add = smb_direct_ib_client_add,
+ .remove = smb_direct_ib_client_remove,
+};
+
int ksmbd_rdma_init(void)
{
int ret;
smb_direct_listener.cm_id = NULL;
+ ret = ib_register_client(&smb_direct_ib_client);
+ if (ret) {
+ pr_err("failed to ib_register_client\n");
+ return ret;
+ }
+
/* When a client is running out of send credits, the credits are
* granted by the server's sending a packet using this queue.
* This avoids the situation that a clients cannot send packets
@@ -2023,7 +2137,7 @@ int ksmbd_rdma_init(void)
if (!smb_direct_wq)
return -ENOMEM;
- ret = smb_direct_listen(SMB_DIRECT_PORT);
+ ret = smb_direct_listen(smb_direct_port);
if (ret) {
destroy_workqueue(smb_direct_wq);
smb_direct_wq = NULL;
@@ -2036,37 +2150,67 @@ int ksmbd_rdma_init(void)
return 0;
}
-int ksmbd_rdma_destroy(void)
+void ksmbd_rdma_destroy(void)
{
- if (smb_direct_listener.cm_id)
- rdma_destroy_id(smb_direct_listener.cm_id);
+ if (!smb_direct_listener.cm_id)
+ return;
+
+ ib_unregister_client(&smb_direct_ib_client);
+ rdma_destroy_id(smb_direct_listener.cm_id);
+
smb_direct_listener.cm_id = NULL;
if (smb_direct_wq) {
- flush_workqueue(smb_direct_wq);
destroy_workqueue(smb_direct_wq);
smb_direct_wq = NULL;
}
- return 0;
}
bool ksmbd_rdma_capable_netdev(struct net_device *netdev)
{
- struct ib_device *ibdev;
+ struct smb_direct_device *smb_dev;
+ int i;
bool rdma_capable = false;
- ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN);
- if (ibdev) {
- if (rdma_frwr_is_supported(&ibdev->attrs))
- rdma_capable = true;
- ib_device_put(ibdev);
+ read_lock(&smb_direct_device_lock);
+ list_for_each_entry(smb_dev, &smb_direct_device_list, list) {
+ for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) {
+ struct net_device *ndev;
+
+ ndev = smb_dev->ib_dev->ops.get_netdev(smb_dev->ib_dev,
+ i + 1);
+ if (!ndev)
+ continue;
+
+ if (ndev == netdev) {
+ dev_put(ndev);
+ rdma_capable = true;
+ goto out;
+ }
+ dev_put(ndev);
+ }
+ }
+out:
+ read_unlock(&smb_direct_device_lock);
+
+ if (rdma_capable == false) {
+ struct ib_device *ibdev;
+
+ ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN);
+ if (ibdev) {
+ if (rdma_frwr_is_supported(&ibdev->attrs))
+ rdma_capable = true;
+ ib_device_put(ibdev);
+ }
}
+
return rdma_capable;
}
static struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = {
.prepare = smb_direct_prepare,
.disconnect = smb_direct_disconnect,
+ .shutdown = smb_direct_shutdown,
.writev = smb_direct_writev,
.read = smb_direct_read,
.rdma_read = smb_direct_rdma_read,
diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h
index 0fa8adc0776f..5567d93a6f96 100644
--- a/fs/ksmbd/transport_rdma.h
+++ b/fs/ksmbd/transport_rdma.h
@@ -7,8 +7,6 @@
#ifndef __KSMBD_TRANSPORT_RDMA_H__
#define __KSMBD_TRANSPORT_RDMA_H__
-#define SMB_DIRECT_PORT 5445
-
/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */
struct smb_direct_negotiate_req {
__le16 min_version;
@@ -52,7 +50,7 @@ struct smb_direct_data_transfer {
#ifdef CONFIG_SMB_SERVER_SMBDIRECT
int ksmbd_rdma_init(void);
-int ksmbd_rdma_destroy(void);
+void ksmbd_rdma_destroy(void);
bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
#else
static inline int ksmbd_rdma_init(void) { return 0; }
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index c14320e03b69..8fef9de787d3 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -404,7 +404,7 @@ static int create_socket(struct interface *iface)
&ksmbd_socket);
if (ret) {
pr_err("Can't create socket for ipv4: %d\n", ret);
- goto out_error;
+ goto out_clear;
}
sin.sin_family = PF_INET;
@@ -462,6 +462,7 @@ static int create_socket(struct interface *iface)
out_error:
tcp_destroy_socket(ksmbd_socket);
+out_clear:
iface->ksmbd_socket = NULL;
return ret;
}
@@ -475,7 +476,7 @@ static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event,
switch (event) {
case NETDEV_UP:
- if (netdev->priv_flags & IFF_BRIDGE_PORT)
+ if (netif_is_bridge_port(netdev))
return NOTIFY_OK;
list_for_each_entry(iface, &iface_list, entry) {
@@ -584,7 +585,7 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz)
rtnl_lock();
for_each_netdev(&init_net, netdev) {
- if (netdev->priv_flags & IFF_BRIDGE_PORT)
+ if (netif_is_bridge_port(netdev))
continue;
if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL)))
return -ENOMEM;
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 835b384b0895..dcdd07c6efff 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -11,7 +11,6 @@
#include <linux/writeback.h>
#include <linux/xattr.h>
#include <linux/falloc.h>
-#include <linux/genhd.h>
#include <linux/fsnotify.h>
#include <linux/dcache.h>
#include <linux/slab.h>
@@ -399,8 +398,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
nbytes = kernel_read(filp, rbuf, count, pos);
if (nbytes < 0) {
- pr_err("smb read failed for (%s), err = %zd\n",
- fp->filename, nbytes);
+ pr_err("smb read failed, err = %zd\n", nbytes);
return nbytes;
}
@@ -876,8 +874,7 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work,
err = vfs_truncate(&filp->f_path, size);
if (err)
- pr_err("truncate failed for filename : %s err %d\n",
- fp->filename, err);
+ pr_err("truncate failed, err %d\n", err);
return err;
}
@@ -1013,7 +1010,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
loff_t off, loff_t len)
{
smb_break_all_levII_oplock(work, fp, 1);
- if (fp->f_ci->m_fattr & ATTR_SPARSE_FILE_LE)
+ if (fp->f_ci->m_fattr & FILE_ATTRIBUTE_SPARSE_FILE_LE)
return vfs_fallocate(fp->filp,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
off, len);
@@ -1624,7 +1621,7 @@ void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat)
time = ksmbd_UnixTimeToNT(kstat->ctime);
info->ChangeTime = cpu_to_le64(time);
- if (ksmbd_kstat->file_attributes & ATTR_DIRECTORY_LE) {
+ if (ksmbd_kstat->file_attributes & FILE_ATTRIBUTE_DIRECTORY_LE) {
info->EndOfFile = 0;
info->AllocationSize = 0;
} else {
@@ -1654,9 +1651,9 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
* or that acl is disable in server's filesystem and the config is yes.
*/
if (S_ISDIR(ksmbd_kstat->kstat->mode))
- ksmbd_kstat->file_attributes = ATTR_DIRECTORY_LE;
+ ksmbd_kstat->file_attributes = FILE_ATTRIBUTE_DIRECTORY_LE;
else
- ksmbd_kstat->file_attributes = ATTR_ARCHIVE_LE;
+ ksmbd_kstat->file_attributes = FILE_ATTRIBUTE_ARCHIVE_LE;
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) {
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index b0d5b8feb4a3..8c37aaf936ab 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -25,48 +25,9 @@ enum {
};
/* CreateOptions */
-/* Flag is set, it must not be a file , valid for directory only */
-#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001)
-#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
-#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
-
-/* Should not buffer on server*/
-#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008)
-/* MBZ */
-#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010)
-/* MBZ */
-#define FILE_SYNCHRONOUS_IO_NONALERT_LE cpu_to_le32(0x00000020)
-
-/* Flaf must not be set for directory */
-#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
-
-/* Should be zero */
#define CREATE_TREE_CONNECTION cpu_to_le32(0x00000080)
-#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
-#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
-#define FILE_OPEN_REMOTE_INSTANCE cpu_to_le32(0x00000400)
-
-/**
- * Doc says this is obsolete "open for recovery" flag should be zero
- * in any case.
- */
-#define CREATE_OPEN_FOR_RECOVERY cpu_to_le32(0x00000400)
-#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
-#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
-#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
-#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
-#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
-
-/* Should be zero*/
-#define FILE_OPEN_REQUIRING_OPLOCK cpu_to_le32(0x00010000)
-#define FILE_DISALLOW_EXCLUSIVE cpu_to_le32(0x00020000)
#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000)
-#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
-#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
-/* Should be zero */
-#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000)
-#define CREATE_OPTIONS_MASK cpu_to_le32(0x00FFFFFF)
#define CREATE_OPTION_READONLY 0x10000000
/* system. NB not sent over wire */
#define CREATE_OPTION_SPECIAL 0x20000000
@@ -86,6 +47,7 @@ struct ksmbd_dir_info {
int last_entry_offset;
bool hide_dot_file;
int flags;
+ int last_entry_off_align;
};
struct ksmbd_readdir_data {
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 29c1db66bd0f..c4d59d2735f0 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -328,7 +328,6 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
kfree(smb_lock);
}
- kfree(fp->filename);
if (ksmbd_stream_fd(fp))
kfree(fp->stream.name);
kmem_cache_free(filp_cache, fp);
@@ -497,6 +496,7 @@ struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode)
list_for_each_entry(lfp, &ci->m_fp_list, node) {
if (inode == file_inode(lfp->filp)) {
atomic_dec(&ci->m_count);
+ lfp = ksmbd_fp_get(lfp);
read_unlock(&ci->m_lock);
return lfp;
}
diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h
index 448576fbe4b7..fcb13413fa8d 100644
--- a/fs/ksmbd/vfs_cache.h
+++ b/fs/ksmbd/vfs_cache.h
@@ -62,7 +62,6 @@ struct ksmbd_inode {
struct ksmbd_file {
struct file *filp;
- char *filename;
u64 persistent_id;
u64 volatile_id;
@@ -96,16 +95,6 @@ struct ksmbd_file {
int durable_timeout;
- /* for SMB1 */
- int pid;
-
- /* conflict lock fail count for SMB1 */
- unsigned int cflock_cnt;
- /* last lock failure start offset for SMB1 */
- unsigned long long llock_fstart;
-
- int dirent_offset;
-
/* if ls is happening on directory, below is valid*/
struct ksmbd_readdir_data readdir_data;
int dot_dotdot[2];
diff --git a/fs/ksmbd/xattr.h b/fs/ksmbd/xattr.h
index 8857c01093d9..16499ca5c82d 100644
--- a/fs/ksmbd/xattr.h
+++ b/fs/ksmbd/xattr.h
@@ -76,7 +76,7 @@ struct xattr_acl_entry {
struct xattr_smb_acl {
int count;
int next;
- struct xattr_acl_entry entries[0];
+ struct xattr_acl_entry entries[];
};
/* 64bytes hash in xattr_ntacl is computed with sha256 */
diff --git a/fs/libfs.c b/fs/libfs.c
index 51b4de3b3447..e64bdedef168 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -448,6 +448,30 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL(simple_rmdir);
+int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ bool old_is_dir = d_is_dir(old_dentry);
+ bool new_is_dir = d_is_dir(new_dentry);
+
+ if (old_dir != new_dir && old_is_dir != new_is_dir) {
+ if (old_is_dir) {
+ drop_nlink(old_dir);
+ inc_nlink(new_dir);
+ } else {
+ drop_nlink(new_dir);
+ inc_nlink(old_dir);
+ }
+ }
+ old_dir->i_ctime = old_dir->i_mtime =
+ new_dir->i_ctime = new_dir->i_mtime =
+ d_inode(old_dentry)->i_ctime =
+ d_inode(new_dentry)->i_ctime = current_time(old_dir);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_rename_exchange);
+
int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
@@ -455,9 +479,12 @@ int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
- if (flags & ~RENAME_NOREPLACE)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
return -EINVAL;
+ if (flags & RENAME_EXCHANGE)
+ return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
@@ -604,7 +631,7 @@ const struct address_space_operations ram_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
.write_end = simple_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
};
EXPORT_SYMBOL(ram_aops);
@@ -1171,17 +1198,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
-void noop_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- /*
- * There is no page cache to invalidate in the dax case, however
- * we need this callback defined to prevent falling back to
- * block_invalidatepage() in do_invalidatepage().
- */
-}
-EXPORT_SYMBOL_GPL(noop_invalidatepage);
-
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
/*
@@ -1204,7 +1220,7 @@ EXPORT_SYMBOL(kfree_link);
struct inode *alloc_anon_inode(struct super_block *s)
{
static const struct address_space_operations anon_aops = {
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
};
struct inode *inode = new_inode_pseudo(s);
@@ -1352,7 +1368,7 @@ bool is_empty_dir_inode(struct inode *inode)
(inode->i_op == &empty_dir_inode_operations);
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/*
* Determine if the name of a dentry should be casefolded.
*
@@ -1446,7 +1462,7 @@ static const struct dentry_operations generic_encrypted_dentry_ops = {
};
#endif
-#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
+#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
@@ -1481,10 +1497,10 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
#ifdef CONFIG_FS_ENCRYPTION
bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
bool needs_ci_ops = dentry->d_sb->s_encoding;
#endif
-#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
+#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
if (needs_encrypt_ops && needs_ci_ops) {
d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
return;
@@ -1496,7 +1512,7 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
return;
}
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (needs_ci_ops) {
d_set_d_op(dentry, &generic_ci_dentry_ops);
return;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index b11f2afa84f1..99fffc9cb958 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -794,9 +794,6 @@ static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
goto retry_cancel;
}
- dprintk("lockd: cancel status %u (task %u)\n",
- status, task->tk_pid);
-
switch (status) {
case NLM_LCK_GRANTED:
case NLM_LCK_DENIED_GRACE_PERIOD:
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index b632be3ad57b..59ef8a1f843f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -54,13 +54,9 @@ EXPORT_SYMBOL_GPL(nlmsvc_ops);
static DEFINE_MUTEX(nlmsvc_mutex);
static unsigned int nlmsvc_users;
-static struct task_struct *nlmsvc_task;
-static struct svc_rqst *nlmsvc_rqst;
+static struct svc_serv *nlmsvc_serv;
unsigned long nlmsvc_timeout;
-static atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0);
-static DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq);
-
unsigned int lockd_net_id;
/*
@@ -184,6 +180,10 @@ lockd(void *vrqstp)
nlm_shutdown_hosts();
cancel_delayed_work_sync(&ln->grace_period_end);
locks_end_grace(&ln->lockd_manager);
+
+ dprintk("lockd_down: service stopped\n");
+
+ svc_exit_thread(rqstp);
return 0;
}
@@ -196,8 +196,8 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
xprt = svc_find_xprt(serv, name, net, family, 0);
if (xprt == NULL)
- return svc_create_xprt(serv, name, net, family, port,
- SVC_SOCK_DEFAULTS, cred);
+ return svc_xprt_create(serv, name, net, family, port,
+ SVC_SOCK_DEFAULTS, cred);
svc_xprt_put(xprt);
return 0;
}
@@ -247,7 +247,8 @@ out_err:
if (warned++ == 0)
printk(KERN_WARNING
"lockd_up: makesock failed, error=%d\n", err);
- svc_shutdown_net(serv, net);
+ svc_xprt_destroy_all(serv, net);
+ svc_rpcb_cleanup(serv, net);
return err;
}
@@ -285,13 +286,12 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
nlm_shutdown_hosts_net(net);
cancel_delayed_work_sync(&ln->grace_period_end);
locks_end_grace(&ln->lockd_manager);
- svc_shutdown_net(serv, net);
- dprintk("%s: per-net data destroyed; net=%x\n",
- __func__, net->ns.inum);
+ svc_xprt_destroy_all(serv, net);
+ svc_rpcb_cleanup(serv, net);
}
} else {
- pr_err("%s: no users! task=%p, net=%x\n",
- __func__, nlmsvc_task, net->ns.inum);
+ pr_err("%s: no users! net=%x\n",
+ __func__, net->ns.inum);
BUG();
}
}
@@ -302,20 +302,16 @@ static int lockd_inetaddr_event(struct notifier_block *this,
struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
struct sockaddr_in sin;
- if ((event != NETDEV_DOWN) ||
- !atomic_inc_not_zero(&nlm_ntf_refcnt))
+ if (event != NETDEV_DOWN)
goto out;
- if (nlmsvc_rqst) {
+ if (nlmsvc_serv) {
dprintk("lockd_inetaddr_event: removed %pI4\n",
&ifa->ifa_local);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = ifa->ifa_local;
- svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
- (struct sockaddr *)&sin);
+ svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin);
}
- atomic_dec(&nlm_ntf_refcnt);
- wake_up(&nlm_ntf_wq);
out:
return NOTIFY_DONE;
@@ -332,21 +328,17 @@ static int lockd_inet6addr_event(struct notifier_block *this,
struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
struct sockaddr_in6 sin6;
- if ((event != NETDEV_DOWN) ||
- !atomic_inc_not_zero(&nlm_ntf_refcnt))
+ if (event != NETDEV_DOWN)
goto out;
- if (nlmsvc_rqst) {
+ if (nlmsvc_serv) {
dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = ifa->addr;
if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
sin6.sin6_scope_id = ifa->idev->dev->ifindex;
- svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
- (struct sockaddr *)&sin6);
+ svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6);
}
- atomic_dec(&nlm_ntf_refcnt);
- wake_up(&nlm_ntf_wq);
out:
return NOTIFY_DONE;
@@ -357,86 +349,15 @@ static struct notifier_block lockd_inet6addr_notifier = {
};
#endif
-static void lockd_unregister_notifiers(void)
-{
- unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
-#if IS_ENABLED(CONFIG_IPV6)
- unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
-#endif
- wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0);
-}
-
-static void lockd_svc_exit_thread(void)
-{
- atomic_dec(&nlm_ntf_refcnt);
- lockd_unregister_notifiers();
- svc_exit_thread(nlmsvc_rqst);
-}
-
-static int lockd_start_svc(struct svc_serv *serv)
+static int lockd_get(void)
{
+ struct svc_serv *serv;
int error;
- if (nlmsvc_rqst)
+ if (nlmsvc_serv) {
+ svc_get(nlmsvc_serv);
+ nlmsvc_users++;
return 0;
-
- /*
- * Create the kernel thread and wait for it to start.
- */
- nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
- if (IS_ERR(nlmsvc_rqst)) {
- error = PTR_ERR(nlmsvc_rqst);
- printk(KERN_WARNING
- "lockd_up: svc_rqst allocation failed, error=%d\n",
- error);
- lockd_unregister_notifiers();
- goto out_rqst;
- }
-
- atomic_inc(&nlm_ntf_refcnt);
- svc_sock_update_bufs(serv);
- serv->sv_maxconn = nlm_max_connections;
-
- nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name);
- if (IS_ERR(nlmsvc_task)) {
- error = PTR_ERR(nlmsvc_task);
- printk(KERN_WARNING
- "lockd_up: kthread_run failed, error=%d\n", error);
- goto out_task;
- }
- nlmsvc_rqst->rq_task = nlmsvc_task;
- wake_up_process(nlmsvc_task);
-
- dprintk("lockd_up: service started\n");
- return 0;
-
-out_task:
- lockd_svc_exit_thread();
- nlmsvc_task = NULL;
-out_rqst:
- nlmsvc_rqst = NULL;
- return error;
-}
-
-static const struct svc_serv_ops lockd_sv_ops = {
- .svo_shutdown = svc_rpcb_cleanup,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
-};
-
-static struct svc_serv *lockd_create_svc(void)
-{
- struct svc_serv *serv;
-
- /*
- * Check whether we're already up and running.
- */
- if (nlmsvc_rqst) {
- /*
- * Note: increase service usage, because later in case of error
- * svc_destroy() will be called.
- */
- svc_get(nlmsvc_rqst->rq_server);
- return nlmsvc_rqst->rq_server;
}
/*
@@ -451,17 +372,44 @@ static struct svc_serv *lockd_create_svc(void)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
+
+ serv->sv_maxconn = nlm_max_connections;
+ error = svc_set_num_threads(serv, NULL, 1);
+ /* The thread now holds the only reference */
+ svc_put(serv);
+ if (error < 0)
+ return error;
+
+ nlmsvc_serv = serv;
register_inetaddr_notifier(&lockd_inetaddr_notifier);
#if IS_ENABLED(CONFIG_IPV6)
register_inet6addr_notifier(&lockd_inet6addr_notifier);
#endif
dprintk("lockd_up: service created\n");
- return serv;
+ nlmsvc_users++;
+ return 0;
+}
+
+static void lockd_put(void)
+{
+ if (WARN(nlmsvc_users <= 0, "lockd_down: no users!\n"))
+ return;
+ if (--nlmsvc_users)
+ return;
+
+ unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
+
+ svc_set_num_threads(nlmsvc_serv, NULL, 0);
+ nlmsvc_serv = NULL;
+ dprintk("lockd_down: service destroyed\n");
}
/*
@@ -469,36 +417,21 @@ static struct svc_serv *lockd_create_svc(void)
*/
int lockd_up(struct net *net, const struct cred *cred)
{
- struct svc_serv *serv;
int error;
mutex_lock(&nlmsvc_mutex);
- serv = lockd_create_svc();
- if (IS_ERR(serv)) {
- error = PTR_ERR(serv);
- goto err_create;
- }
+ error = lockd_get();
+ if (error)
+ goto err;
- error = lockd_up_net(serv, net, cred);
+ error = lockd_up_net(nlmsvc_serv, net, cred);
if (error < 0) {
- lockd_unregister_notifiers();
- goto err_put;
+ lockd_put();
+ goto err;
}
- error = lockd_start_svc(serv);
- if (error < 0) {
- lockd_down_net(serv, net);
- goto err_put;
- }
- nlmsvc_users++;
- /*
- * Note: svc_serv structures have an initial use count of 1,
- * so we exit through here on both success and failure.
- */
-err_put:
- svc_destroy(serv);
-err_create:
+err:
mutex_unlock(&nlmsvc_mutex);
return error;
}
@@ -511,27 +444,8 @@ void
lockd_down(struct net *net)
{
mutex_lock(&nlmsvc_mutex);
- lockd_down_net(nlmsvc_rqst->rq_server, net);
- if (nlmsvc_users) {
- if (--nlmsvc_users)
- goto out;
- } else {
- printk(KERN_ERR "lockd_down: no users! task=%p\n",
- nlmsvc_task);
- BUG();
- }
-
- if (!nlmsvc_task) {
- printk(KERN_ERR "lockd_down: no lockd running.\n");
- BUG();
- }
- kthread_stop(nlmsvc_task);
- dprintk("lockd_down: service stopped\n");
- lockd_svc_exit_thread();
- dprintk("lockd_down: service destroyed\n");
- nlmsvc_task = NULL;
- nlmsvc_rqst = NULL;
-out:
+ lockd_down_net(nlmsvc_serv, net);
+ lockd_put();
mutex_unlock(&nlmsvc_mutex);
}
EXPORT_SYMBOL_GPL(lockd_down);
@@ -780,11 +694,9 @@ module_exit(exit_nlm);
static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
const struct svc_procedure *procp = rqstp->rq_procinfo;
- struct kvec *argv = rqstp->rq_arg.head;
- struct kvec *resv = rqstp->rq_res.head;
svcxdr_init_decode(rqstp);
- if (!procp->pc_decode(rqstp, argv->iov_base))
+ if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
*statp = procp->pc_func(rqstp);
@@ -794,7 +706,7 @@ static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp)
return 1;
svcxdr_init_encode(rqstp);
- if (!procp->pc_encode(rqstp, resv->iov_base + resv->iov_len))
+ if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;
return 1;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index e10ae2c41279..176b468a61c7 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -269,8 +269,6 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp)
*/
static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
{
- dprintk("lockd: %5u callback returned %d\n", task->tk_pid,
- -task->tk_status);
}
static void nlm4svc_callback_release(void *data)
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e9b85d8fd5fe..cb3658ab9b7a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -470,8 +470,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie, int reclaim)
{
- struct nlm_block *block = NULL;
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
struct inode *inode = nlmsvc_file_inode(file);
+#endif
+ struct nlm_block *block = NULL;
int error;
int mode;
int async_block = 0;
@@ -484,7 +486,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
- if (inode->i_sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS) {
+ if (nlmsvc_file_file(file)->f_op->lock) {
async_block = wait;
wait = 0;
}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 99696d3f6dd6..4dc1b40a489a 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -301,8 +301,6 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp)
*/
static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
{
- dprintk("lockd: %5u callback returned %d\n", task->tk_pid,
- -task->tk_status);
}
void nlmsvc_release_call(struct nlm_rqst *call)
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index cb3a7512c33e..0a22a2faf552 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -179,19 +179,21 @@ nlm_delete_file(struct nlm_file *file)
static int nlm_unlock_files(struct nlm_file *file)
{
struct file_lock lock;
- struct file *f;
+ locks_init_lock(&lock);
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
- for (f = file->f_file[0]; f <= file->f_file[1]; f++) {
- if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) {
- pr_warn("lockd: unlock failure in %s:%d\n",
- __FILE__, __LINE__);
- return 1;
- }
- }
+ if (file->f_file[O_RDONLY] &&
+ vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL))
+ goto out_err;
+ if (file->f_file[O_WRONLY] &&
+ vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL))
+ goto out_err;
return 0;
+out_err:
+ pr_warn("lockd: unlock failure in %s:%d\n", __FILE__, __LINE__);
+ return 1;
}
/*
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9235e60b1769..2fb5748dae0c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -145,137 +145,131 @@ svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp)
* Decode Call arguments
*/
-int
-nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
-int
-nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
argp->monitor = 1; /* monitor client by default */
- return 1;
+ return true;
}
-int
-nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
argp->lock.fl.fl_type = F_UNLCK;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_res *resp = rqstp->rq_argp;
if (!svcxdr_decode_cookie(xdr, &resp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_stats(xdr, &resp->status))
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_reboot *argp = rqstp->rq_argp;
+ __be32 *p;
u32 len;
if (xdr_stream_decode_u32(xdr, &len) < 0)
- return 0;
+ return false;
if (len > SM_MAXSTRLEN)
- return 0;
+ return false;
p = xdr_inline_decode(xdr, len);
if (!p)
- return 0;
+ return false;
argp->len = len;
argp->mon = (char *)p;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
p = xdr_inline_decode(xdr, SM_PRIV_SIZE);
if (!p)
- return 0;
+ return false;
memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
- return 1;
+ return true;
}
-int
-nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
@@ -284,35 +278,34 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
lock->svid = ~(u32)0;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
- return 0;
+ return false;
if (!svcxdr_decode_fhandle(xdr, &lock->fh))
- return 0;
+ return false;
if (!svcxdr_decode_owner(xdr, &lock->oh))
- return 0;
+ return false;
/* XXX: Range checks are missing in the original code */
if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
@@ -320,45 +313,42 @@ nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
* Encode Reply results
*/
-int
-nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
-int
-nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
return svcxdr_encode_cookie(xdr, &resp->cookie) &&
svcxdr_encode_testrply(xdr, resp);
}
-int
-nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
return svcxdr_encode_cookie(xdr, &resp->cookie) &&
svcxdr_encode_stats(xdr, resp->status);
}
-int
-nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
if (!svcxdr_encode_cookie(xdr, &resp->cookie))
- return 0;
+ return false;
if (!svcxdr_encode_stats(xdr, resp->status))
- return 0;
+ return false;
/* sequence */
if (xdr_stream_encode_u32(xdr, 0) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 98e957e4566c..856267c0864b 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -144,136 +144,131 @@ svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp)
* Decode Call arguments
*/
-int
-nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
-int
-nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return 1;
+ return true;
}
-int
-nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
argp->monitor = 1; /* monitor client by default */
- return 1;
+ return true;
}
-int
-nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return 1;
+
+ return true;
}
-int
-nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
argp->lock.fl.fl_type = F_UNLCK;
- return 1;
+ return true;
}
-int
-nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_res *resp = rqstp->rq_argp;
if (!svcxdr_decode_cookie(xdr, &resp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_stats(xdr, &resp->status))
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_reboot *argp = rqstp->rq_argp;
+ __be32 *p;
u32 len;
if (xdr_stream_decode_u32(xdr, &len) < 0)
- return 0;
+ return false;
if (len > SM_MAXSTRLEN)
- return 0;
+ return false;
p = xdr_inline_decode(xdr, len);
if (!p)
- return 0;
+ return false;
argp->len = len;
argp->mon = (char *)p;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
p = xdr_inline_decode(xdr, SM_PRIV_SIZE);
if (!p)
- return 0;
+ return false;
memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
- return 1;
+ return true;
}
-int
-nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
@@ -282,35 +277,34 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
lock->svid = ~(u32)0;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
- return 0;
+ return false;
if (!svcxdr_decode_fhandle(xdr, &lock->fh))
- return 0;
+ return false;
if (!svcxdr_decode_owner(xdr, &lock->oh))
- return 0;
+ return false;
/* XXX: Range checks are missing in the original code */
if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
@@ -318,45 +312,42 @@ nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
* Encode Reply results
*/
-int
-nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
-int
-nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
return svcxdr_encode_cookie(xdr, &resp->cookie) &&
svcxdr_encode_testrply(xdr, resp);
}
-int
-nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
return svcxdr_encode_cookie(xdr, &resp->cookie) &&
svcxdr_encode_stats(xdr, resp->status);
}
-int
-nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
if (!svcxdr_encode_cookie(xdr, &resp->cookie))
- return 0;
+ return false;
if (!svcxdr_encode_stats(xdr, resp->status))
- return 0;
+ return false;
/* sequence */
if (xdr_stream_encode_u32(xdr, 0) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
diff --git a/fs/locks.c b/fs/locks.c
index 3d6fb4ae847b..8c6df10cd9ed 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2,117 +2,11 @@
/*
* linux/fs/locks.c
*
- * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
- * Doug Evans (dje@spiff.uucp), August 07, 1992
+ * We implement four types of file locks: BSD locks, posix locks, open
+ * file description locks, and leases. For details about BSD locks,
+ * see the flock(2) man page; for details about the other three, see
+ * fcntl(2).
*
- * Deadlock detection added.
- * FIXME: one thing isn't handled yet:
- * - mandatory locks (requires lots of changes elsewhere)
- * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
- *
- * Miscellaneous edits, and a total rewrite of posix_lock_file() code.
- * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
- *
- * Converted file_lock_table to a linked list from an array, which eliminates
- * the limits on how many active file locks are open.
- * Chad Page (pageone@netcom.com), November 27, 1994
- *
- * Removed dependency on file descriptors. dup()'ed file descriptors now
- * get the same locks as the original file descriptors, and a close() on
- * any file descriptor removes ALL the locks on the file for the current
- * process. Since locks still depend on the process id, locks are inherited
- * after an exec() but not after a fork(). This agrees with POSIX, and both
- * BSD and SVR4 practice.
- * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
- *
- * Scrapped free list which is redundant now that we allocate locks
- * dynamically with kmalloc()/kfree().
- * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
- *
- * Implemented two lock personalities - FL_FLOCK and FL_POSIX.
- *
- * FL_POSIX locks are created with calls to fcntl() and lockf() through the
- * fcntl() system call. They have the semantics described above.
- *
- * FL_FLOCK locks are created with calls to flock(), through the flock()
- * system call, which is new. Old C libraries implement flock() via fcntl()
- * and will continue to use the old, broken implementation.
- *
- * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
- * with a file pointer (filp). As a result they can be shared by a parent
- * process and its children after a fork(). They are removed when the last
- * file descriptor referring to the file pointer is closed (unless explicitly
- * unlocked).
- *
- * FL_FLOCK locks never deadlock, an existing lock is always removed before
- * upgrading from shared to exclusive (or vice versa). When this happens
- * any processes blocked by the current lock are woken up and allowed to
- * run before the new lock is applied.
- * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
- *
- * Removed some race conditions in flock_lock_file(), marked other possible
- * races. Just grep for FIXME to see them.
- * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
- *
- * Addressed Dmitry's concerns. Deadlock checking no longer recursive.
- * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
- * once we've checked for blocking and deadlocking.
- * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
- *
- * Initial implementation of mandatory locks. SunOS turned out to be
- * a rotten model, so I implemented the "obvious" semantics.
- * See 'Documentation/filesystems/mandatory-locking.rst' for details.
- * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
- *
- * Don't allow mandatory locks on mmap()'ed files. Added simple functions to
- * check if a file has mandatory locks, used by mmap(), open() and creat() to
- * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
- * Manual, Section 2.
- * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
- *
- * Tidied up block list handling. Added '/proc/locks' interface.
- * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
- *
- * Fixed deadlock condition for pathological code that mixes calls to
- * flock() and fcntl().
- * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
- *
- * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
- * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
- * guarantee sensible behaviour in the case where file system modules might
- * be compiled with different options than the kernel itself.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
- * (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
- * locks. Changed process synchronisation to avoid dereferencing locks that
- * have already been freed.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
- *
- * Made the block list a circular list to minimise searching in the list.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
- *
- * Made mandatory locking a mount option. Default is not to allow mandatory
- * locking.
- * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
- *
- * Some adaptations for NFS support.
- * Olaf Kirch (okir@monad.swb.de), Dec 1996,
- *
- * Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
- * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
- *
- * Use slab allocator instead of kmalloc/kfree.
- * Use generic list implementation from <linux/list.h>.
- * Sped up posix_locks_deadlock by only considering blocked locks.
- * Matthew Wilcox <willy@debian.org>, March, 2000.
- *
- * Leases and LOCK_MAND
- * Matthew Wilcox <willy@debian.org>, June, 2000.
- * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
*
* Locking conflicts and dependencies:
* If multiple threads attempt to lock the same byte (or flock the same file)
@@ -168,6 +62,7 @@
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
#include <linux/percpu.h>
+#include <linux/sysctl.h>
#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>
@@ -194,8 +89,37 @@ static int target_leasetype(struct file_lock *fl)
return fl->fl_type;
}
-int leases_enable = 1;
-int lease_break_time = 45;
+static int leases_enable = 1;
+static int lease_break_time = 45;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table locks_sysctls[] = {
+ {
+ .procname = "leases-enable",
+ .data = &leases_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_MMU
+ {
+ .procname = "lease-break-time",
+ .data = &lease_break_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_MMU */
+ {}
+};
+
+static int __init init_fs_locks_sysctls(void)
+{
+ register_sysctl_init("fs", locks_sysctls);
+ return 0;
+}
+early_initcall(init_fs_locks_sysctls);
+#endif /* CONFIG_SYSCTL */
/*
* The global file_lock_list is only used for displaying /proc/locks, so we
@@ -461,8 +385,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
}
static inline int flock_translate_cmd(int cmd) {
- if (cmd & LOCK_MAND)
- return cmd & (LOCK_MAND | LOCK_RW);
switch (cmd) {
case LOCK_SH:
return F_RDLCK;
@@ -942,8 +864,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl,
*/
if (caller_fl->fl_file == sys_fl->fl_file)
return false;
- if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
- return false;
return locks_conflict(caller_fl, sys_fl);
}
@@ -2116,11 +2036,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
* - %LOCK_SH -- a shared lock.
* - %LOCK_EX -- an exclusive lock.
* - %LOCK_UN -- remove an existing lock.
- * - %LOCK_MAND -- a 'mandatory' flock.
- * This exists to emulate Windows Share Modes.
+ * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
*
- * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
- * processes read and write access respectively.
+ * %LOCK_MAND support has been removed from the kernel.
*/
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
@@ -2137,9 +2055,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
cmd &= ~LOCK_NB;
unlock = (cmd == LOCK_UN);
- if (!unlock && !(cmd & LOCK_MAND) &&
- !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ goto out_putf;
+
+ /*
+ * LOCK_MAND locks were broken for a long time in that they never
+ * conflicted with one another and didn't prevent any sort of open,
+ * read or write activity.
+ *
+ * Just ignore these requests now, to preserve legacy behavior, but
+ * throw a warning to let people know that they don't actually work.
+ */
+ if (cmd & LOCK_MAND) {
+ pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+ error = 0;
goto out_putf;
+ }
lock = flock_make_lock(f.file, cmd, NULL);
if (IS_ERR(lock)) {
@@ -2718,6 +2649,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
struct inode *inode = NULL;
unsigned int fl_pid;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
+ int type;
fl_pid = locks_translate_pid(fl, proc_pidns);
/*
@@ -2745,11 +2677,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
seq_printf(f, " %s ",
(inode == NULL) ? "*NOINODE*" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
- if (fl->fl_type & LOCK_MAND) {
- seq_puts(f, "FLOCK MSNFS ");
- } else {
- seq_puts(f, "FLOCK ADVISORY ");
- }
+ seq_puts(f, "FLOCK ADVISORY ");
} else if (IS_LEASE(fl)) {
if (fl->fl_flags & FL_DELEG)
seq_puts(f, "DELEG ");
@@ -2765,17 +2693,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
} else {
seq_puts(f, "UNKNOWN UNKNOWN ");
}
- if (fl->fl_type & LOCK_MAND) {
- seq_printf(f, "%s ",
- (fl->fl_type & LOCK_READ)
- ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ "
- : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
- } else {
- int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
+ type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
- seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
- (type == F_RDLCK) ? "READ" : "UNLCK");
- }
+ seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
+ (type == F_RDLCK) ? "READ" : "UNLCK");
if (inode) {
/* userspace relies on this representation of dev_t */
seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index a71f1cf894b9..f1a6610e4ee6 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -63,7 +63,7 @@ static struct kmem_cache * minix_inode_cachep;
static struct inode *minix_alloc_inode(struct super_block *sb)
{
struct minix_inode_info *ei;
- ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, minix_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -442,12 +442,14 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations minix_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = minix_readpage,
.writepage = minix_writepage,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
- .bmap = minix_bmap
+ .bmap = minix_bmap,
+ .direct_IO = noop_direct_IO
};
static const struct inode_operations minix_symlink_inode_operations = {
diff --git a/fs/mpage.c b/fs/mpage.c
index 334e7d09aa65..1fe56f8c495f 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -29,7 +29,6 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
-#include <linux/cleancache.h>
#include "internal.h"
/*
@@ -58,38 +57,14 @@ static void mpage_end_io(struct bio *bio)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
+static struct bio *mpage_bio_submit(struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
- bio_set_op_attrs(bio, op, op_flags);
guard_bio_eod(bio);
submit_bio(bio);
return NULL;
}
-static struct bio *
-mpage_alloc(struct block_device *bdev,
- sector_t first_sector, int nr_vecs,
- gfp_t gfp_flags)
-{
- struct bio *bio;
-
- /* Restrict the given (page cache) mask for slab allocations */
- gfp_flags &= GFP_KERNEL;
- bio = bio_alloc(gfp_flags, nr_vecs);
-
- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
- }
-
- if (bio) {
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = first_sector;
- }
- return bio;
-}
-
/*
* support function for mpage_readahead. The fs supplied get_block might
* return an up to date buffer. This is used to map that buffer into
@@ -170,17 +145,14 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
- int op_flags;
+ int op = REQ_OP_READ;
unsigned nblocks;
unsigned relative_block;
- gfp_t gfp;
+ gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
if (args->is_readahead) {
- op_flags = REQ_RAHEAD;
- gfp = readahead_gfp_mask(page->mapping);
- } else {
- op_flags = 0;
- gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ op |= REQ_RAHEAD;
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
}
if (page_has_buffers(page))
@@ -284,17 +256,11 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
SetPageMappedToDisk(page);
}
- if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
- cleancache_get_page(page) == 0) {
- SetPageUptodate(page);
- goto confused;
- }
-
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
alloc_new:
if (args->bio == NULL) {
@@ -303,15 +269,16 @@ alloc_new:
page))
goto out;
}
- args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- bio_max_segs(args->nr_pages), gfp);
+ args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), op,
+ gfp);
if (args->bio == NULL)
goto confused;
+ args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
}
length = first_hole << blkbits;
if (bio_add_page(args->bio, page, length, 0) < length) {
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
goto alloc_new;
}
@@ -319,7 +286,7 @@ alloc_new:
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
else
args->last_block_in_bio = blocks[blocks_per_page - 1];
out:
@@ -327,7 +294,7 @@ out:
confused:
if (args->bio)
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
if (!PageUptodate(page))
block_read_full_page(page, args->get_block);
else
@@ -390,7 +357,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
put_page(page);
}
if (args.bio)
- mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
+ mpage_bio_submit(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);
@@ -407,7 +374,7 @@ int mpage_readpage(struct page *page, get_block_t get_block)
args.bio = do_mpage_readpage(&args);
if (args.bio)
- mpage_bio_submit(REQ_OP_READ, 0, args.bio);
+ mpage_bio_submit(args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);
@@ -498,7 +465,6 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
- int op_flags = wbc_to_write_flags(wbc);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -511,7 +477,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
if (!buffer_mapped(bh)) {
/*
* unmapped dirty buffers are created by
- * __set_page_dirty_buffers -> mmapped data
+ * block_dirty_folio -> mmapped data
*/
if (buffer_dirty(bh))
goto confused;
@@ -606,7 +572,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
alloc_new:
if (bio == NULL) {
@@ -615,13 +581,11 @@ alloc_new:
page, wbc))
goto out;
}
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH);
- if (bio == NULL)
- goto confused;
-
+ bio = bio_alloc(bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wbc),
+ GFP_NOFS);
+ bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
wbc_init_bio(wbc, bio);
- bio->bi_write_hint = inode->i_write_hint;
}
/*
@@ -632,7 +596,7 @@ alloc_new:
wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
goto alloc_new;
}
@@ -642,7 +606,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -654,7 +618,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -710,11 +674,8 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio) {
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0);
- mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
- }
+ if (mpd.bio)
+ mpage_bio_submit(mpd.bio);
}
blk_finish_plug(&plug);
return ret;
@@ -731,11 +692,8 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio) {
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0);
- mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
- }
+ if (mpd.bio)
+ mpage_bio_submit(mpd.bio);
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
diff --git a/fs/namei.c b/fs/namei.c
index 1946d9667790..509657fdf4f5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1020,10 +1020,60 @@ static inline void put_link(struct nameidata *nd)
path_put(&last->link);
}
-int sysctl_protected_symlinks __read_mostly = 0;
-int sysctl_protected_hardlinks __read_mostly = 0;
-int sysctl_protected_fifos __read_mostly;
-int sysctl_protected_regular __read_mostly;
+static int sysctl_protected_symlinks __read_mostly;
+static int sysctl_protected_hardlinks __read_mostly;
+static int sysctl_protected_fifos __read_mostly;
+static int sysctl_protected_regular __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table namei_sysctls[] = {
+ {
+ .procname = "protected_symlinks",
+ .data = &sysctl_protected_symlinks,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "protected_hardlinks",
+ .data = &sysctl_protected_hardlinks,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "protected_fifos",
+ .data = &sysctl_protected_fifos,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "protected_regular",
+ .data = &sysctl_protected_regular,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ { }
+};
+
+static int __init init_fs_namei_sysctls(void)
+{
+ register_sysctl_init("fs", namei_sysctls);
+ return 0;
+}
+fs_initcall(init_fs_namei_sysctls);
+
+#endif /* CONFIG_SYSCTL */
/**
* may_follow_link - Check symlink following for unsafe situations
@@ -3076,9 +3126,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
int error = get_write_access(inode);
if (error)
return error;
- /*
- * Refuse to truncate files with mandatory locks held on them.
- */
+
error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
@@ -3625,18 +3673,14 @@ static struct dentry *filename_create(int dfd, struct filename *name,
{
struct dentry *dentry = ERR_PTR(-EEXIST);
struct qstr last;
+ bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
+ unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
+ unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
int type;
int err2;
int error;
- bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
-
- /*
- * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
- * other flags passed in are ignored!
- */
- lookup_flags &= LOOKUP_REVAL;
- error = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+ error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
if (error)
return ERR_PTR(error);
@@ -3650,11 +3694,13 @@ static struct dentry *filename_create(int dfd, struct filename *name,
/* don't fail immediately if it's r/o, at least try to report other errors */
err2 = mnt_want_write(path->mnt);
/*
- * Do the final lookup.
+ * Do the final lookup. Suppress 'create' if there is a trailing
+ * '/', and a directory wasn't requested.
*/
- lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+ if (last.name[last.len] && !want_dir)
+ create_flags = 0;
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
- dentry = __lookup_hash(&last, path->dentry, lookup_flags);
+ dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags);
if (IS_ERR(dentry))
goto unlock;
@@ -3668,7 +3714,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* all is fine. Let's be bastards - you had / on the end, you've
* been asking for (non-existent) directory. -ENOENT for you.
*/
- if (unlikely(!is_dir && last.name[last.len])) {
+ if (unlikely(!create_flags)) {
error = -ENOENT;
goto fail;
}
@@ -3960,7 +4006,8 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
inode_lock(dentry->d_inode);
error = -EBUSY;
- if (is_local_mountpoint(dentry))
+ if (is_local_mountpoint(dentry) ||
+ (dentry->d_inode->i_flags & S_KERNEL_FILE))
goto out;
error = security_inode_rmdir(dir, dentry);
@@ -3975,13 +4022,12 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
dentry->d_inode->i_flags |= S_DEAD;
dont_mount(dentry);
detach_mounts(dentry);
- fsnotify_rmdir(dir, dentry);
out:
inode_unlock(dentry->d_inode);
dput(dentry);
if (!error)
- d_delete(dentry);
+ d_delete_notify(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_rmdir);
@@ -4103,7 +4149,6 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
if (!error) {
dont_mount(dentry);
detach_mounts(dentry);
- fsnotify_unlink(dir, dentry);
}
}
}
@@ -4111,9 +4156,11 @@ out:
inode_unlock(target);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
- if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
+ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ fsnotify_unlink(dir, dentry);
+ } else if (!error) {
fsnotify_link_count(target);
- d_delete(dentry);
+ d_delete_notify(dir, dentry);
}
return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 659a8f39c61a..afe2b64b14f1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -31,12 +31,13 @@
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
+#include <linux/mnt_idmapping.h>
#include "pnode.h"
#include "internal.h"
/* Maximum number of mounts in a mount namespace */
-unsigned int sysctl_mount_max __read_mostly = 100000;
+static unsigned int sysctl_mount_max __read_mostly = 100000;
static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
@@ -343,8 +344,24 @@ int __mnt_want_write(struct vfsmount *m)
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
- cpu_relax();
+ might_lock(&mount_lock.lock);
+ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ cpu_relax();
+ } else {
+ /*
+ * This prevents priority inversion, if the task
+ * setting MNT_WRITE_HOLD got preempted on a remote
+ * CPU, and it prevents livelock if the task setting
+ * MNT_WRITE_HOLD has a lower priority and is bound to
+ * the same CPU as the task that is spinning here.
+ */
+ preempt_enable();
+ lock_mount_hash();
+ unlock_mount_hash();
+ preempt_disable();
+ }
+ }
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
* be set to match its requirements. So we must not load that until
@@ -468,6 +485,24 @@ void mnt_drop_write_file(struct file *file)
}
EXPORT_SYMBOL(mnt_drop_write_file);
+/**
+ * mnt_hold_writers - prevent write access to the given mount
+ * @mnt: mnt to prevent write access to
+ *
+ * Prevents write access to @mnt if there are no active writers for @mnt.
+ * This function needs to be called and return successfully before changing
+ * properties of @mnt that need to remain stable for callers with write access
+ * to @mnt.
+ *
+ * After this function has been called successfully, callers must pair it with
+ * a call to mnt_unhold_writers() in order to stop preventing write access to
+ * @mnt.
+ *
+ * Context: This function expects lock_mount_hash() to be held serializing
+ * setting MNT_WRITE_HOLD.
+ * Return: On success 0 is returned.
+ * On error, -EBUSY is returned.
+ */
static inline int mnt_hold_writers(struct mount *mnt)
{
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -499,6 +534,18 @@ static inline int mnt_hold_writers(struct mount *mnt)
return 0;
}
+/**
+ * mnt_unhold_writers - stop preventing write access to the given mount
+ * @mnt: mnt to stop preventing write access to
+ *
+ * Stop preventing write access to @mnt allowing callers to gain write access
+ * to @mnt again.
+ *
+ * This function can only be called after a successful call to
+ * mnt_hold_writers().
+ *
+ * Context: This function expects lock_mount_hash() to be held.
+ */
static inline void mnt_unhold_writers(struct mount *mnt)
{
/*
@@ -532,12 +579,9 @@ int sb_prepare_remount_readonly(struct super_block *sb)
lock_mount_hash();
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
- mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
- smp_mb();
- if (mnt_get_writers(mnt) > 0) {
- err = -EBUSY;
+ err = mnt_hold_writers(mnt);
+ if (err)
break;
- }
}
}
if (!err && atomic_long_read(&sb->s_remove_count))
@@ -561,7 +605,7 @@ static void free_vfsmnt(struct mount *mnt)
struct user_namespace *mnt_userns;
mnt_userns = mnt_user_ns(&mnt->mnt);
- if (mnt_userns != &init_user_ns)
+ if (!initial_idmapping(mnt_userns))
put_user_ns(mnt_userns);
kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
@@ -965,6 +1009,7 @@ static struct mount *skip_mnt_tree(struct mount *p)
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
struct mount *mnt;
+ struct user_namespace *fs_userns;
if (!fc->root)
return ERR_PTR(-EINVAL);
@@ -982,6 +1027,10 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
+ fs_userns = mnt->mnt.mnt_sb->s_user_ns;
+ if (!initial_idmapping(fs_userns))
+ mnt->mnt.mnt_userns = get_user_ns(fs_userns);
+
lock_mount_hash();
list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
unlock_mount_hash();
@@ -1072,7 +1121,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
atomic_inc(&sb->s_active);
mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
- if (mnt->mnt.mnt_userns != &init_user_ns)
+ if (!initial_idmapping(mnt->mnt.mnt_userns))
mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
mnt->mnt.mnt_sb = sb;
mnt->mnt.mnt_root = dget(root);
@@ -2063,22 +2112,23 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
{
unsigned int max = READ_ONCE(sysctl_mount_max);
- unsigned int mounts = 0, old, pending, sum;
+ unsigned int mounts = 0;
struct mount *p;
+ if (ns->mounts >= max)
+ return -ENOSPC;
+ max -= ns->mounts;
+ if (ns->pending_mounts >= max)
+ return -ENOSPC;
+ max -= ns->pending_mounts;
+
for (p = mnt; p; p = next_mnt(p, mnt))
mounts++;
- old = ns->mounts;
- pending = ns->pending_mounts;
- sum = old + pending;
- if ((old > sum) ||
- (pending > sum) ||
- (max < sum) ||
- (mounts > (max - sum)))
+ if (mounts > max)
return -ENOSPC;
- ns->pending_mounts = pending + mounts;
+ ns->pending_mounts += mounts;
return 0;
}
@@ -2561,6 +2611,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
struct super_block *sb = mnt->mnt_sb;
if (!__mnt_is_readonly(mnt) &&
+ (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
char *buf = (char *)__get_free_page(GFP_KERNEL);
char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
@@ -2575,6 +2626,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
tm.tm_year+1900, (unsigned long long)sb->s_time_max);
free_page((unsigned long)buf);
+ sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
}
}
@@ -2870,7 +2922,7 @@ static int do_move_mount_old(struct path *path, const char *old_name)
* add a mount into a namespace's mount tree
*/
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
- struct path *path, int mnt_flags)
+ const struct path *path, int mnt_flags)
{
struct mount *parent = real_mount(path->mnt);
@@ -2993,7 +3045,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
return err;
}
-int finish_automount(struct vfsmount *m, struct path *path)
+int finish_automount(struct vfsmount *m, const struct path *path)
{
struct dentry *dentry = path->dentry;
struct mountpoint *mp;
@@ -3927,28 +3979,32 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
struct vfsmount *m = &mnt->mnt;
+ struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
if (!kattr->mnt_userns)
return 0;
/*
+ * Creating an idmapped mount with the filesystem wide idmapping
+ * doesn't make sense so block that. We don't allow mushy semantics.
+ */
+ if (kattr->mnt_userns == fs_userns)
+ return -EINVAL;
+
+ /*
* Once a mount has been idmapped we don't allow it to change its
* mapping. It makes things simpler and callers can just create
* another bind-mount they can idmap if they want to.
*/
- if (mnt_user_ns(m) != &init_user_ns)
+ if (is_idmapped_mnt(m))
return -EPERM;
/* The underlying filesystem doesn't support idmapped mounts yet. */
if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
return -EINVAL;
- /* Don't yet support filesystem mountable in user namespaces. */
- if (m->mnt_sb->s_user_ns != &init_user_ns)
- return -EINVAL;
-
/* We're not controlling the superblock. */
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
return -EPERM;
/* Mount has already been visible in the filesystem hierarchy. */
@@ -3958,102 +4014,122 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
return 0;
}
-static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
- struct mount *mnt, int *err)
+/**
+ * mnt_allow_writers() - check whether the attribute change allows writers
+ * @kattr: the new mount attributes
+ * @mnt: the mount to which @kattr will be applied
+ *
+ * Check whether the new mount attributes in @kattr allow concurrent writers.
+ *
+ * Return: true if writers may continue, false if they need to be held
+ */
+static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
+ const struct mount *mnt)
{
- struct mount *m = mnt, *last = NULL;
+ return !(kattr->attr_set & MNT_READONLY) ||
+ (mnt->mnt.mnt_flags & MNT_READONLY);
+}
- if (!is_mounted(&m->mnt)) {
- *err = -EINVAL;
- goto out;
- }
+static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
+{
+ struct mount *m;
+ int err;
- if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) {
- *err = -EINVAL;
- goto out;
- }
+ for (m = mnt; m; m = next_mnt(m, mnt)) {
+ if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
+ err = -EPERM;
+ break;
+ }
- do {
- unsigned int flags;
+ err = can_idmap_mount(kattr, m);
+ if (err)
+ break;
- flags = recalc_flags(kattr, m);
- if (!can_change_locked_flags(m, flags)) {
- *err = -EPERM;
- goto out;
+ if (!mnt_allow_writers(kattr, m)) {
+ err = mnt_hold_writers(m);
+ if (err)
+ break;
}
- *err = can_idmap_mount(kattr, m);
- if (*err)
- goto out;
+ if (!kattr->recurse)
+ return 0;
+ }
- last = m;
+ if (err) {
+ struct mount *p;
- if ((kattr->attr_set & MNT_READONLY) &&
- !(m->mnt.mnt_flags & MNT_READONLY)) {
- *err = mnt_hold_writers(m);
- if (*err)
- goto out;
+ /*
+ * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
+ * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
+ * mounts and needs to take care to include the first mount.
+ */
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ /* If we had to hold writers unblock them. */
+ if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
+ mnt_unhold_writers(p);
+
+ /*
+ * We're done once the first mount we changed got
+ * MNT_WRITE_HOLD unset.
+ */
+ if (p == m)
+ break;
}
- } while (kattr->recurse && (m = next_mnt(m, mnt)));
-
-out:
- return last;
+ }
+ return err;
}
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
- struct user_namespace *mnt_userns;
+ struct user_namespace *mnt_userns, *old_mnt_userns;
if (!kattr->mnt_userns)
return;
+ /*
+ * We're the only ones able to change the mount's idmapping. So
+ * mnt->mnt.mnt_userns is stable and we can retrieve it directly.
+ */
+ old_mnt_userns = mnt->mnt.mnt_userns;
+
mnt_userns = get_user_ns(kattr->mnt_userns);
/* Pairs with smp_load_acquire() in mnt_user_ns(). */
smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
+
+ /*
+ * If this is an idmapped filesystem drop the reference we've taken
+ * in vfs_create_mount() before.
+ */
+ if (!initial_idmapping(old_mnt_userns))
+ put_user_ns(old_mnt_userns);
}
-static void mount_setattr_commit(struct mount_kattr *kattr,
- struct mount *mnt, struct mount *last,
- int err)
+static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
{
- struct mount *m = mnt;
+ struct mount *m;
- do {
- if (!err) {
- unsigned int flags;
+ for (m = mnt; m; m = next_mnt(m, mnt)) {
+ unsigned int flags;
- do_idmap_mount(kattr, m);
- flags = recalc_flags(kattr, m);
- WRITE_ONCE(m->mnt.mnt_flags, flags);
- }
+ do_idmap_mount(kattr, m);
+ flags = recalc_flags(kattr, m);
+ WRITE_ONCE(m->mnt.mnt_flags, flags);
- /*
- * We either set MNT_READONLY above so make it visible
- * before ~MNT_WRITE_HOLD or we failed to recursively
- * apply mount options.
- */
- if ((kattr->attr_set & MNT_READONLY) &&
- (m->mnt.mnt_flags & MNT_WRITE_HOLD))
+ /* If we had to hold writers unblock them. */
+ if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
mnt_unhold_writers(m);
- if (!err && kattr->propagation)
+ if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
-
- /*
- * On failure, only cleanup until we found the first mount
- * we failed to handle.
- */
- if (err && m == last)
+ if (!kattr->recurse)
break;
- } while (kattr->recurse && (m = next_mnt(m, mnt)));
-
- if (!err)
- touch_mnt_namespace(mnt->mnt_ns);
+ }
+ touch_mnt_namespace(mnt->mnt_ns);
}
static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
{
- struct mount *mnt = real_mount(path->mnt), *last = NULL;
+ struct mount *mnt = real_mount(path->mnt);
int err = 0;
if (path->dentry != mnt->mnt.mnt_root)
@@ -4074,16 +4150,32 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
}
}
+ err = -EINVAL;
lock_mount_hash();
+ /* Ensure that this isn't anything purely vfs internal. */
+ if (!is_mounted(&mnt->mnt))
+ goto out;
+
/*
- * Get the mount tree in a shape where we can change mount
- * properties without failure.
+ * If this is an attached mount make sure it's located in the caller's
+ * mount namespace. If it's not don't let the caller interact with it.
+ * If this is a detached mount make sure it has an anonymous mount
+ * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
*/
- last = mount_setattr_prepare(kattr, mnt, &err);
- if (last) /* Commit all changes or revert to the old state. */
- mount_setattr_commit(kattr, mnt, last, err);
+ if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
+ goto out;
+
+ /*
+ * First, we get the mount tree in a shape where we can change mount
+ * properties without failure. If we succeed in doing so we commit all
+ * changes and if we failed we clean up.
+ */
+ err = mount_setattr_prepare(kattr, mnt);
+ if (!err)
+ mount_setattr_commit(kattr, mnt);
+out:
unlock_mount_hash();
if (kattr->propagation) {
@@ -4133,13 +4225,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
}
/*
- * The init_user_ns is used to indicate that a vfsmount is not idmapped.
- * This is simpler than just having to treat NULL as unmapped. Users
- * wanting to idmap a mount to init_user_ns can just use a namespace
- * with an identity mapping.
+ * The initial idmapping cannot be used to create an idmapped
+ * mount. We use the initial idmapping as an indicator of a mount
+ * that is not idmapped. It can simply be passed into helpers that
+ * are aware of idmapped mounts as a convenient shortcut. A user
+ * can just create a dedicated identity mapping to achieve the same
+ * result.
*/
mnt_userns = container_of(ns, struct user_namespace, ns);
- if (mnt_userns == &init_user_ns) {
+ if (initial_idmapping(mnt_userns)) {
err = -EPERM;
goto out_fput;
}
@@ -4263,12 +4357,11 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
err = user_path_at(dfd, path, kattr.lookup_flags, &target);
- if (err)
- return err;
-
- err = do_mount_setattr(&target, &kattr);
+ if (!err) {
+ err = do_mount_setattr(&target, &kattr);
+ path_put(&target);
+ }
finish_mount_kattr(&kattr);
- path_put(&target);
return err;
}
@@ -4596,3 +4689,25 @@ const struct proc_ns_operations mntns_operations = {
.install = mntns_install,
.owner = mntns_owner,
};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table fs_namespace_sysctls[] = {
+ {
+ .procname = "mount-max",
+ .data = &sysctl_mount_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+ { }
+};
+
+static int __init init_fs_namespace_sysctls(void)
+{
+ register_sysctl_init("fs", fs_namespace_sysctls);
+ return 0;
+}
+fs_initcall(init_fs_namespace_sysctls);
+
+#endif /* CONFIG_SYSCTL */
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index c15bfc966d96..f684c0cd1ec5 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -1,5 +1,11 @@
# SPDX-License-Identifier: GPL-2.0
-netfs-y := read_helper.o stats.o
+netfs-y := \
+ buffered_read.o \
+ io.o \
+ main.o \
+ objects.o
+
+netfs-$(CONFIG_NETFS_STATS) += stats.o
obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
new file mode 100644
index 000000000000..281a88a5b8dc
--- /dev/null
+++ b/fs/netfs/buffered_read.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Network filesystem high-level buffered read support.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/*
+ * Unlock the folios in a read operation. We need to set PG_fscache on any
+ * folios we're going to write back before we unlock them.
+ */
+void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct folio *folio;
+ unsigned int iopos, account = 0;
+ pgoff_t start_page = rreq->start / PAGE_SIZE;
+ pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+ bool subreq_failed = false;
+
+ XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+
+ if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
+ __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+ }
+ }
+
+ /* Walk through the pagecache and the I/O request lists simultaneously.
+ * We may have a mixture of cached and uncached sections and we only
+ * really want to write out the uncached sections. This is slightly
+ * complicated by the possibility that we might have huge pages with a
+ * mixture inside.
+ */
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ iopos = 0;
+ subreq_failed = (subreq->error < 0);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_page) {
+ unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
+ unsigned int pgend = pgpos + folio_size(folio);
+ bool pg_failed = false;
+
+ for (;;) {
+ if (!subreq) {
+ pg_failed = true;
+ break;
+ }
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ folio_start_fscache(folio);
+ pg_failed |= subreq_failed;
+ if (pgend < iopos + subreq->len)
+ break;
+
+ account += subreq->transferred;
+ iopos += subreq->len;
+ if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+ subreq = list_next_entry(subreq, rreq_link);
+ subreq_failed = (subreq->error < 0);
+ } else {
+ subreq = NULL;
+ subreq_failed = false;
+ }
+ if (pgend == iopos)
+ break;
+ }
+
+ if (!pg_failed) {
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+ }
+
+ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
+ if (folio_index(folio) == rreq->no_unlock_folio &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
+ _debug("no unlock");
+ else
+ folio_unlock(folio);
+ }
+ }
+ rcu_read_unlock();
+
+ task_io_account_read(account);
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+}
+
+static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
+ loff_t *_start, size_t *_len, loff_t i_size)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ if (cres->ops && cres->ops->expand_readahead)
+ cres->ops->expand_readahead(cres, _start, _len, i_size);
+}
+
+static void netfs_rreq_expand(struct netfs_io_request *rreq,
+ struct readahead_control *ractl)
+{
+ /* Give the cache a chance to change the request parameters. The
+ * resultant request must contain the original region.
+ */
+ netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
+
+ /* Give the netfs a chance to change the request parameters. The
+ * resultant request must contain the original region.
+ */
+ if (rreq->netfs_ops->expand_readahead)
+ rreq->netfs_ops->expand_readahead(rreq);
+
+ /* Expand the request if the cache wants it to start earlier. Note
+ * that the expansion may get further extended if the VM wishes to
+ * insert THPs and the preferred start and/or end wind up in the middle
+ * of THPs.
+ *
+ * If this is the case, however, the THP size should be an integer
+ * multiple of the cache granule size, so we get a whole number of
+ * granules to deal with.
+ */
+ if (rreq->start != readahead_pos(ractl) ||
+ rreq->len != readahead_length(ractl)) {
+ readahead_expand(ractl, rreq->start, rreq->len);
+ rreq->start = readahead_pos(ractl);
+ rreq->len = readahead_length(ractl);
+
+ trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+ netfs_read_trace_expanded);
+ }
+}
+
+/**
+ * netfs_readahead - Helper to manage a read request
+ * @ractl: The description of the readahead request
+ *
+ * Fulfil a readahead request by drawing data from the cache if possible, or
+ * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
+ * requests from different sources will get munged together. If necessary, the
+ * readahead window can be expanded in either direction to a more convenient
+ * alignment for RPC efficiency or to make storage in the cache feasible.
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+void netfs_readahead(struct readahead_control *ractl)
+{
+ struct netfs_io_request *rreq;
+ struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
+ int ret;
+
+ _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
+
+ if (readahead_count(ractl) == 0)
+ return;
+
+ rreq = netfs_alloc_request(ractl->mapping, ractl->file,
+ readahead_pos(ractl),
+ readahead_length(ractl),
+ NETFS_READAHEAD);
+ if (IS_ERR(rreq))
+ return;
+
+ if (ctx->ops->begin_cache_operation) {
+ ret = ctx->ops->begin_cache_operation(rreq);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto cleanup_free;
+ }
+
+ netfs_stat(&netfs_n_rh_readahead);
+ trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+ netfs_read_trace_readahead);
+
+ netfs_rreq_expand(rreq, ractl);
+
+ /* Drop the refs on the folios here rather than in the cache or
+ * filesystem. The locks will be dropped in netfs_rreq_unlock_folios().
+ */
+ while (readahead_folio(ractl))
+ ;
+
+ netfs_begin_read(rreq, false);
+ return;
+
+cleanup_free:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+ return;
+}
+EXPORT_SYMBOL(netfs_readahead);
+
+/**
+ * netfs_readpage - Helper to manage a readpage request
+ * @file: The file to read from
+ * @subpage: A subpage of the folio to read
+ *
+ * Fulfil a readpage request by drawing data from the cache if possible, or the
+ * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests
+ * from different sources will get munged together.
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_readpage(struct file *file, struct page *subpage)
+{
+ struct folio *folio = page_folio(subpage);
+ struct address_space *mapping = folio_file_mapping(folio);
+ struct netfs_io_request *rreq;
+ struct netfs_i_context *ctx = netfs_i_context(mapping->host);
+ int ret;
+
+ _enter("%lx", folio_index(folio));
+
+ rreq = netfs_alloc_request(mapping, file,
+ folio_file_pos(folio), folio_size(folio),
+ NETFS_READPAGE);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto alloc_error;
+ }
+
+ if (ctx->ops->begin_cache_operation) {
+ ret = ctx->ops->begin_cache_operation(rreq);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto discard;
+ }
+
+ netfs_stat(&netfs_n_rh_readpage);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
+ return netfs_begin_read(rreq, true);
+
+discard:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+alloc_error:
+ folio_unlock(folio);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_readpage);
+
+/*
+ * Prepare a folio for writing without reading first
+ * @folio: The folio being prepared
+ * @pos: starting position for the write
+ * @len: length of write
+ * @always_fill: T if the folio should always be completely filled/cleared
+ *
+ * In some cases, write_begin doesn't need to read at all:
+ * - full folio write
+ * - write that lies in a folio that is completely beyond EOF
+ * - write that covers the folio from start to EOF or beyond it
+ *
+ * If any of these criteria are met, then zero out the unwritten parts
+ * of the folio and return true. Otherwise, return false.
+ */
+static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
+ bool always_fill)
+{
+ struct inode *inode = folio_inode(folio);
+ loff_t i_size = i_size_read(inode);
+ size_t offset = offset_in_folio(folio, pos);
+ size_t plen = folio_size(folio);
+
+ if (unlikely(always_fill)) {
+ if (pos - offset + len <= i_size)
+ return false; /* Page entirely before EOF */
+ zero_user_segment(&folio->page, 0, plen);
+ folio_mark_uptodate(folio);
+ return true;
+ }
+
+ /* Full folio write */
+ if (offset == 0 && len >= plen)
+ return true;
+
+ /* Page entirely beyond the end of the file */
+ if (pos - offset >= i_size)
+ goto zero_out;
+
+ /* Write that covers from the start of the folio to EOF or beyond */
+ if (offset == 0 && (pos + len) >= i_size)
+ goto zero_out;
+
+ return false;
+zero_out:
+ zero_user_segments(&folio->page, 0, offset, offset + len, plen);
+ return true;
+}
+
+/**
+ * netfs_write_begin - Helper to prepare for writing
+ * @file: The file to read from
+ * @mapping: The mapping to read from
+ * @pos: File position at which the write will begin
+ * @len: The length of the write (may extend beyond the end of the folio chosen)
+ * @aop_flags: AOP_* flags
+ * @_folio: Where to put the resultant folio
+ * @_fsdata: Place for the netfs to store a cookie
+ *
+ * Pre-read data for a write-begin request by drawing data from the cache if
+ * possible, or the netfs if not. Space beyond the EOF is zero-filled.
+ * Multiple I/O requests from different sources will get munged together. If
+ * necessary, the readahead window can be expanded in either direction to a
+ * more convenient alignment for RPC efficiency or to make storage in the cache
+ * feasible.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory.
+ *
+ * The check_write_begin() operation can be provided to check for and flush
+ * conflicting writes once the folio is grabbed and locked. It is passed a
+ * pointer to the fsdata cookie that gets returned to the VM to be passed to
+ * write_end. It is permitted to sleep. It should return 0 if the request
+ * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
+ * be regot; or return an error.
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int aop_flags,
+ struct folio **_folio, void **_fsdata)
+{
+ struct netfs_io_request *rreq;
+ struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));
+ struct folio *folio;
+ unsigned int fgp_flags;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ int ret;
+
+ DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
+
+retry:
+ fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
+ if (aop_flags & AOP_FLAG_NOFS)
+ fgp_flags |= FGP_NOFS;
+ folio = __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+ if (!folio)
+ return -ENOMEM;
+
+ if (ctx->ops->check_write_begin) {
+ /* Allow the netfs (eg. ceph) to flush conflicts. */
+ ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
+ if (ret < 0) {
+ trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
+ if (ret == -EAGAIN)
+ goto retry;
+ goto error;
+ }
+ }
+
+ if (folio_test_uptodate(folio))
+ goto have_folio;
+
+ /* If the page is beyond the EOF, we want to clear it - unless it's
+ * within the cache granule containing the EOF, in which case we need
+ * to preload the granule.
+ */
+ if (!netfs_is_cache_enabled(ctx) &&
+ netfs_skip_folio_read(folio, pos, len, false)) {
+ netfs_stat(&netfs_n_rh_write_zskip);
+ goto have_folio_no_wait;
+ }
+
+ rreq = netfs_alloc_request(mapping, file,
+ folio_file_pos(folio), folio_size(folio),
+ NETFS_READ_FOR_WRITE);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto error;
+ }
+ rreq->no_unlock_folio = folio_index(folio);
+ __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
+
+ if (ctx->ops->begin_cache_operation) {
+ ret = ctx->ops->begin_cache_operation(rreq);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
+ }
+
+ netfs_stat(&netfs_n_rh_write_begin);
+ trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
+
+ /* Expand the request to meet caching requirements and download
+ * preferences.
+ */
+ ractl._nr_pages = folio_nr_pages(folio);
+ netfs_rreq_expand(rreq, &ractl);
+
+ /* We hold the folio locks, so we can drop the references */
+ folio_get(folio);
+ while (readahead_folio(&ractl))
+ ;
+
+ ret = netfs_begin_read(rreq, true);
+ if (ret < 0)
+ goto error;
+
+have_folio:
+ ret = folio_wait_fscache_killable(folio);
+ if (ret < 0)
+ goto error;
+have_folio_no_wait:
+ *_folio = folio;
+ _leave(" = 0");
+ return 0;
+
+error_put:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+error:
+ folio_unlock(folio);
+ folio_put(folio);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_write_begin);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index b7f2c4459f33..b7b0e3d18d9e 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -5,6 +5,10 @@
* Written by David Howells (dhowells@redhat.com)
*/
+#include <linux/netfs.h>
+#include <linux/fscache.h>
+#include <trace/events/netfs.h>
+
#ifdef pr_fmt
#undef pr_fmt
#endif
@@ -12,11 +16,40 @@
#define pr_fmt(fmt) "netfs: " fmt
/*
- * read_helper.c
+ * buffered_read.c
+ */
+void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
+
+/*
+ * io.c
+ */
+int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
+
+/*
+ * main.c
*/
extern unsigned int netfs_debug;
/*
+ * objects.c
+ */
+struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
+ struct file *file,
+ loff_t start, size_t len,
+ enum netfs_io_origin origin);
+void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
+void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async);
+void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
+ enum netfs_rreq_ref_trace what);
+struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq);
+
+static inline void netfs_see_request(struct netfs_io_request *rreq,
+ enum netfs_rreq_ref_trace what)
+{
+ trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what);
+}
+
+/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
@@ -55,6 +88,21 @@ static inline void netfs_stat_d(atomic_t *stat)
#define netfs_stat_d(x) do {} while(0)
#endif
+/*
+ * Miscellaneous functions.
+ */
+static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx)
+{
+#if IS_ENABLED(CONFIG_FSCACHE)
+ struct fscache_cookie *cookie = ctx->cache;
+
+ return fscache_cookie_valid(cookie) && cookie->cache_priv &&
+ fscache_cookie_enabled(cookie);
+#else
+ return false;
+#endif
+}
+
/*****************************************************************************/
/*
* debug tracing
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
new file mode 100644
index 000000000000..428925899282
--- /dev/null
+++ b/fs/netfs/io.c
@@ -0,0 +1,657 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Network filesystem high-level read support.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/*
+ * Zero the part of a subrequest's buffer that has not been filled yet, i.e.
+ * everything in the pagecache from the transferred point to the end of the
+ * subrequest.
+ */
+static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
+{
+	struct iov_iter iter;
+	loff_t unread_pos = subreq->start + subreq->transferred;
+	size_t unread_len = subreq->len - subreq->transferred;
+
+	iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
+			unread_pos, unread_len);
+	iov_iter_zero(iov_iter_count(&iter), &iter);
+}
+
+/*
+ * Completion callback handed to the cache's read op.  @priv is the subrequest
+ * that was passed when the read was issued; the result is forwarded unchanged.
+ */
+static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
+					bool was_async)
+{
+	netfs_subreq_terminated(priv, transferred_or_error, was_async);
+}
+
+/*
+ * Issue a read against the cache for the not-yet-transferred part of a
+ * subrequest.
+ * - Eats the caller's ref on subreq.
+ * - Completion is reported through netfs_cache_read_terminated().
+ */
+static void netfs_read_from_cache(struct netfs_io_request *rreq,
+				  struct netfs_io_subrequest *subreq,
+				  enum netfs_read_from_hole read_hole)
+{
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
+	struct iov_iter iter;
+	loff_t pos;
+	size_t remaining;
+
+	netfs_stat(&netfs_n_rh_read);
+
+	pos = subreq->start + subreq->transferred;
+	remaining = subreq->len - subreq->transferred;
+	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, pos, remaining);
+
+	cres->ops->read(cres, subreq->start, &iter, read_hole,
+			netfs_cache_read_terminated, subreq);
+}
+
+/*
+ * Fill a subrequest region with zeroes.
+ *
+ * No I/O is issued: the subrequest is flagged NETFS_SREQ_CLEAR_TAIL and then
+ * terminated synchronously with zero bytes transferred, which leaves it to
+ * netfs_subreq_terminated() to clear the untransferred part of the buffer.
+ * @rreq is not used by this function.
+ */
+static void netfs_fill_with_zeroes(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ netfs_stat(&netfs_n_rh_zero);
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ netfs_subreq_terminated(subreq, 0, false);
+}
+
+/*
+ * Ask the netfs to issue a read request to the server for us.
+ *
+ * The netfs is expected to read from subreq->start + subreq->transferred to
+ * subreq->start + subreq->len - 1. It may not backtrack and write data into
+ * the buffer prior to the transferred point as it might clobber dirty data
+ * obtained from the cache.
+ *
+ * Alternatively, the netfs is allowed to indicate one of two things:
+ *
+ * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
+ * make progress.
+ *
+ * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
+ * cleared.
+ *
+ * NOTE(review): nothing in this file references NETFS_SREQ_SHORT_READ; short
+ * reads are tracked here with NETFS_SREQ_SHORT_IO - confirm the intended name.
+ *
+ * The read itself is handed to the netfs through ->issue_read().
+ */
+static void netfs_read_from_server(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ netfs_stat(&netfs_n_rh_download);
+ rreq->netfs_ops->issue_read(subreq);
+}
+
+/*
+ * Release those waiting.
+ *
+ * Final step for a request: emit the "done" trace, drop every subrequest still
+ * on rreq->subrequests and then put a ref on the request itself. @was_async
+ * is passed through to both put paths.
+ */
+static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
+{
+ trace_netfs_rreq(rreq, netfs_rreq_trace_done);
+ netfs_clear_subrequests(rreq, was_async);
+ netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete);
+}
+
+/*
+ * Deal with the completion of writing the data to the cache.  We have to clear
+ * the PG_fscache bits on the folios involved and release the caller's ref.
+ *
+ * The folios are found by walking the mapping's xarray under the RCU read
+ * lock, one walk per subrequest still on the request's list.  Once all the
+ * folios have been handled, the request is completed.
+ *
+ * May be called in softirq mode and we inherit a ref from the caller.
+ */
+static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
+					  bool was_async)
+{
+	struct netfs_io_subrequest *subreq;
+	struct folio *folio;
+	pgoff_t unlocked = 0;
+	bool have_unlocked = false;
+
+	rcu_read_lock();
+
+	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+		XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
+
+		xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
+			/* A lockless walk can hand back a retry entry rather
+			 * than a folio; it must not be dereferenced.
+			 */
+			if (xas_retry(&xas, folio))
+				continue;
+
+			/* We might have multiple writes from the same huge
+			 * folio, but we mustn't unlock a folio more than once.
+			 */
+			if (have_unlocked && folio_index(folio) <= unlocked)
+				continue;
+			unlocked = folio_index(folio);
+			folio_end_fscache(folio);
+			have_unlocked = true;
+		}
+	}
+
+	rcu_read_unlock();
+	netfs_rreq_completed(rreq, was_async);
+}
+
+/*
+ * Completion callback for one copy-to-cache write.  @priv is the subrequest
+ * the write was issued for.  Accounts for success or failure, and if this was
+ * the last copy operation outstanding on the request, goes on to unmark the
+ * folios and complete the request.  The subrequest ref taken for the write is
+ * put on the way out.
+ */
+static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
+				       bool was_async)
+{
+	struct netfs_io_subrequest *subreq = priv;
+	struct netfs_io_request *rreq = subreq->rreq;
+
+	if (!IS_ERR_VALUE(transferred_or_error)) {
+		netfs_stat(&netfs_n_rh_write_done);
+	} else {
+		netfs_stat(&netfs_n_rh_write_failed);
+		trace_netfs_failure(rreq, subreq, transferred_or_error,
+				    netfs_fail_copy_to_cache);
+	}
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
+
+	/* Whoever takes nr_copy_ops to zero owns the request ref. */
+	if (atomic_dec_and_test(&rreq->nr_copy_ops))
+		netfs_rreq_unmark_after_write(rreq, was_async);
+
+	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+
+/*
+ * Perform any outstanding writes to the cache. We inherit a ref from the
+ * caller.
+ *
+ * This runs in three stages:
+ * 1. Subrequests not flagged NETFS_SREQ_COPY_TO_CACHE are unlinked from the
+ * request's list and put.
+ * 2. For each remaining subrequest, directly following neighbours (those
+ * starting exactly where this one ends) are folded into it, then the cache
+ * is asked to prepare and perform the write. A prepare_write failure is
+ * traced and that subrequest is skipped.
+ * 3. The guard count taken on nr_copy_ops at the start is dropped; if that
+ * takes the counter to zero, the folios are unmarked and the request is
+ * completed from here.
+ */
+static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct netfs_io_subrequest *subreq, *next, *p;
+ struct iov_iter iter;
+ int ret;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_copy);
+
+ /* We don't want terminating writes trying to wake us up whilst we're
+ * still going through the list.
+ */
+ atomic_inc(&rreq->nr_copy_ops);
+
+ list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
+ if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+ list_del_init(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, false,
+ netfs_sreq_trace_put_no_copy);
+ }
+ }
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ /* Amalgamate adjacent writes */
+ while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+ next = list_next_entry(subreq, rreq_link);
+ if (next->start != subreq->start + subreq->len)
+ break;
+ subreq->len += next->len;
+ list_del_init(&next->rreq_link);
+ netfs_put_subrequest(next, false,
+ netfs_sreq_trace_put_merged);
+ }
+
+ ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
+ rreq->i_size, true); /* start and len are passed by pointer */
+ if (ret < 0) {
+ trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
+ continue;
+ }
+
+ iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
+ subreq->start, subreq->len);
+
+ atomic_inc(&rreq->nr_copy_ops); /* dropped in netfs_rreq_copy_terminated() */
+ netfs_stat(&netfs_n_rh_write);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); /* put by the completion handler */
+ trace_netfs_sreq(subreq, netfs_sreq_trace_write);
+ cres->ops->write(cres, subreq->start, &iter,
+ netfs_rreq_copy_terminated, subreq);
+ }
+
+ /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_copy_ops))
+ netfs_rreq_unmark_after_write(rreq, false);
+}
+
+/*
+ * Work item entry point: recover the request that embeds @work and run the
+ * copy-to-cache pass on it.
+ */
+static void netfs_rreq_write_to_cache_work(struct work_struct *work)
+{
+	netfs_rreq_do_write_to_cache(container_of(work, struct netfs_io_request,
+						  work));
+}
+
+/*
+ * Punt the copy-to-cache pass for a request to the unbound system workqueue,
+ * reusing the request's own work item.  Failure to queue is treated as fatal.
+ */
+static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
+{
+	bool queued;
+
+	rreq->work.func = netfs_rreq_write_to_cache_work;
+	queued = queue_work(system_unbound_wq, &rreq->work);
+	if (!queued)
+		BUG();
+}
+
+/*
+ * Handle a short read.
+ *
+ * The subrequest is reissued to the same source it was originally read from:
+ * NETFS_SREQ_SHORT_IO is cleared, NETFS_SREQ_SEEK_DATA_READ is set, and an
+ * extra subrequest ref plus an nr_outstanding count are taken to cover the
+ * new operation. A cache re-read is issued with NETFS_READ_HOLE_CLEAR.
+ */
+static void netfs_rreq_short_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+ __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
+
+ netfs_stat(&netfs_n_rh_short_read);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
+
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read);
+ atomic_inc(&rreq->nr_outstanding);
+ if (subreq->source == NETFS_READ_FROM_CACHE)
+ netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR);
+ else
+ netfs_read_from_server(rreq, subreq);
+}
+
+/*
+ * Resubmit any short or failed operations. Returns true if we got the rreq
+ * ref back.
+ *
+ * NETFS_RREQ_INCOMPLETE_IO is cleared and the subrequest list is walked:
+ * - a subrequest with an error that was read from the cache is switched to
+ * NETFS_DOWNLOAD_FROM_SERVER, has its error cleared and is reissued;
+ * - a subrequest with an error from any other source stops the walk;
+ * - a subrequest flagged NETFS_SREQ_SHORT_IO is reissued via
+ * netfs_rreq_short_read().
+ *
+ * A guard count is held on nr_outstanding for the duration of the walk. If
+ * dropping it takes the counter to zero, true is returned; otherwise waiters
+ * on nr_outstanding are woken and false is returned.
+ */
+static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+
+ WARN_ON(in_interrupt());
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
+
+ /* We don't want terminating submissions trying to wake us up whilst
+ * we're still going through the list.
+ */
+ atomic_inc(&rreq->nr_outstanding);
+
+ __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->error) {
+ if (subreq->source != NETFS_READ_FROM_CACHE)
+ break;
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ subreq->error = 0;
+ netfs_stat(&netfs_n_rh_download_instead);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ atomic_inc(&rreq->nr_outstanding);
+ netfs_read_from_server(rreq, subreq);
+ } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
+ netfs_rreq_short_read(rreq, subreq);
+ }
+ }
+
+ /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ return true;
+
+ wake_up_var(&rreq->nr_outstanding);
+ return false;
+}
+
+/*
+ * Ask the netfs, if it provides the hook, whether the data just read is still
+ * valid.  If it says not, every subrequest that was satisfied from the cache
+ * is marked as failed with -ESTALE and the request is flagged as having
+ * incomplete I/O.
+ */
+static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
+{
+	struct netfs_io_subrequest *subreq;
+
+	/* No hook means nothing to check. */
+	if (!rreq->netfs_ops->is_still_valid)
+		return;
+	if (rreq->netfs_ops->is_still_valid(rreq))
+		return;
+
+	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+		if (subreq->source != NETFS_READ_FROM_CACHE)
+			continue;
+		subreq->error = -ESTALE;
+		__set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+	}
+}
+
+/*
+ * Assess the state of a read request and decide what to do next.
+ *
+ * After revalidating the data, one of three things happens:
+ *  - if the request has not failed but has incomplete I/O, the short or failed
+ *    subrequests are resubmitted; the assessment is repeated straight away if
+ *    the resubmission handed the ref back, otherwise we return and leave it to
+ *    the outstanding operations;
+ *  - otherwise the folios are unlocked, NETFS_RREQ_IN_PROGRESS is cleared and
+ *    its waiters are woken; then, if anything is flagged for copying to the
+ *    cache, that is queued;
+ *  - failing that, the request is completed.
+ *
+ * Note that we could be in an ordinary kernel thread, on a workqueue or in
+ * softirq context at this point.  We inherit a ref from the caller.
+ */
+static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async)
+{
+	trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
+
+again:
+	netfs_rreq_is_still_valid(rreq);
+
+	if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
+	    test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
+		if (netfs_rreq_perform_resubmissions(rreq))
+			goto again;
+		return;
+	}
+
+	netfs_rreq_unlock_folios(rreq);
+
+	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+	wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+	/* Call and return separately: a void function must not return an
+	 * expression.
+	 */
+	if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) {
+		netfs_rreq_write_to_cache(rreq);
+		return;
+	}
+
+	netfs_rreq_completed(rreq, was_async);
+}
+
+/*
+ * Work item entry point: assess the request that embeds @work, reporting the
+ * context as not asynchronous.
+ */
+static void netfs_rreq_work(struct work_struct *work)
+{
+	netfs_rreq_assess(container_of(work, struct netfs_io_request, work),
+			  false);
+}
+
+/*
+ * Handle the completion of all outstanding I/O operations on a read request.
+ * We inherit a ref from the caller.
+ *
+ * If there is incomplete I/O and we were called asynchronously, the
+ * assessment is punted to the unbound system workqueue; in all other cases it
+ * is run directly.
+ */
+static void netfs_rreq_terminated(struct netfs_io_request *rreq,
+				  bool was_async)
+{
+	bool punt = test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
+		    was_async;
+
+	if (!punt) {
+		netfs_rreq_assess(rreq, was_async);
+		return;
+	}
+
+	if (!queue_work(system_unbound_wq, &rreq->work))
+		BUG();
+}
+
+/**
+ * netfs_subreq_terminated - Note the termination of an I/O operation.
+ * @subreq: The I/O request that has terminated.
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the read helper that a contributory I/O operation has terminated,
+ * one way or another, and that it should integrate the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred, 0 to
+ * indicate a failure to transfer anything that should be retried or a negative
+ * error code. The helper will look after reissuing I/O operations as
+ * appropriate and writing downloaded data to the cache.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ *
+ * Outcome handling, as implemented below:
+ *
+ * - An error from a cache read flags the request NETFS_RREQ_INCOMPLETE_IO;
+ * an error from any other source flags it NETFS_RREQ_FAILED and is copied
+ * into rreq->error.
+ *
+ * - A short transfer on a subrequest flagged NETFS_SREQ_CLEAR_TAIL has its
+ * remainder zeroed and is then treated as complete.
+ *
+ * - Any other short transfer flags the subrequest NETFS_SREQ_SHORT_IO and
+ * the request NETFS_RREQ_INCOMPLETE_IO; a zero-byte result arriving while
+ * NETFS_SREQ_NO_PROGRESS is already set fails the subrequest with -ENODATA.
+ *
+ * In every case nr_outstanding is decremented and a ref on @subreq is put.
+ */
+void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
+ ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ int u;
+
+ _enter("[%u]{%llx,%lx},%zd",
+ subreq->debug_index, subreq->start, subreq->flags,
+ transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_READ_FROM_CACHE:
+ netfs_stat(&netfs_n_rh_read_done);
+ break;
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_stat(&netfs_n_rh_download_done);
+ break;
+ default:
+ break;
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ trace_netfs_failure(rreq, subreq, transferred_or_error,
+ netfs_fail_read);
+ goto failed;
+ }
+
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq overread: R%x[%x] %zd > %zu - %zu",
+ rreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred; /* clamp to what was asked for */
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+ if (subreq->transferred < subreq->len)
+ goto incomplete;
+
+complete:
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); /* propagate to the request */
+
+out:
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ /* If we decrement nr_outstanding to 0, the ref belongs to us. */
+ u = atomic_dec_return(&rreq->nr_outstanding);
+ if (u == 0)
+ netfs_rreq_terminated(rreq, was_async);
+ else if (u == 1)
+ wake_up_var(&rreq->nr_outstanding); /* a waiter may be holding the last count */
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+ return;
+
+incomplete:
+ if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
+ netfs_clear_unread(subreq);
+ subreq->transferred = subreq->len;
+ goto complete;
+ }
+
+ if (transferred_or_error == 0) {
+ if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ subreq->error = -ENODATA;
+ goto failed;
+ }
+ } else {
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+
+ __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ goto out;
+
+failed:
+ if (subreq->source == NETFS_READ_FROM_CACHE) {
+ netfs_stat(&netfs_n_rh_read_failed);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ } else {
+ netfs_stat(&netfs_n_rh_download_failed);
+ set_bit(NETFS_RREQ_FAILED, &rreq->flags);
+ rreq->error = subreq->error;
+ }
+ goto out;
+}
+EXPORT_SYMBOL(netfs_subreq_terminated);
+
+/*
+ * Decide where the data for a subrequest should come from.  If the request
+ * has cache operations, the decision is delegated to the cache; otherwise a
+ * subrequest starting at or beyond the request's i_size is zero-filled and
+ * anything else is downloaded from the server.
+ */
+static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq,
+						     loff_t i_size)
+{
+	struct netfs_io_request *rreq = subreq->rreq;
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+	if (!cres->ops)
+		return subreq->start >= rreq->i_size ?
+			NETFS_FILL_WITH_ZEROES : NETFS_DOWNLOAD_FROM_SERVER;
+
+	return cres->ops->prepare_read(subreq, i_size);
+}
+
+/*
+ * Work out what sort of subrequest the next one will be.
+ *
+ * The source is first chosen by netfs_cache_prepare_read(). For a download,
+ * the length is trimmed so that it does not run past rreq->i_size and the
+ * netfs is then given the chance to clamp it further through ->clamp_length();
+ * if that hook returns false the read is invalid. A subrequest that ends up
+ * with a zero length triggers a warning and is also invalid.
+ *
+ * The chosen source is stored in subreq->source and returned.
+ */
+static enum netfs_io_source
+netfs_rreq_prepare_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ enum netfs_io_source source;
+
+ _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
+
+ source = netfs_cache_prepare_read(subreq, rreq->i_size);
+ if (source == NETFS_INVALID_READ)
+ goto out;
+
+ if (source == NETFS_DOWNLOAD_FROM_SERVER) {
+ /* Call out to the netfs to let it shrink the request to fit
+ * its own I/O sizes and boundaries. If it shrinks it here, it
+ * will be called again to make simultaneous calls; if it wants
+ * to make serial calls, it can indicate a short read and then
+ * we will call it again.
+ */
+ if (subreq->len > rreq->i_size - subreq->start)
+ subreq->len = rreq->i_size - subreq->start;
+
+ if (rreq->netfs_ops->clamp_length &&
+ !rreq->netfs_ops->clamp_length(subreq)) {
+ source = NETFS_INVALID_READ;
+ goto out;
+ }
+ }
+
+ if (WARN_ON(subreq->len == 0))
+ source = NETFS_INVALID_READ;
+
+out:
+ subreq->source = source;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ return source;
+}
+
+/*
+ * Slice off a piece of a read request and submit an I/O request for it.
+ *
+ * A new subrequest is allocated to cover everything from the submitted point
+ * to the end of the request and is appended to rreq->subrequests. Preparing
+ * the read may shorten it; rreq->submitted is then advanced by the final
+ * length and the slice is dispatched according to its source.
+ *
+ * @_debug_index supplies the index for this subrequest and is incremented.
+ *
+ * Returns true if a slice was submitted, false if the subrequest could not be
+ * allocated or the read was invalid. In the invalid case the subrequest's
+ * error is copied to rreq->error and one ref on it is put; the subrequest is
+ * not unlinked from the list here.
+ */
+static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
+ unsigned int *_debug_index)
+{
+ struct netfs_io_subrequest *subreq;
+ enum netfs_io_source source;
+
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq)
+ return false;
+
+ subreq->debug_index = (*_debug_index)++;
+ subreq->start = rreq->start + rreq->submitted;
+ subreq->len = rreq->len - rreq->submitted;
+
+ _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+
+ /* Call out to the cache to find out what it can do with the remaining
+ * subset. It tells us in subreq->flags what it decided should be done
+ * and adjusts subreq->len down if the subset crosses a cache boundary.
+ *
+ * Then when we hand the subset, it can choose to take a subset of that
+ * (the starts must coincide), in which case, we go around the loop
+ * again and ask it to download the next piece.
+ */
+ source = netfs_rreq_prepare_read(rreq, subreq);
+ if (source == NETFS_INVALID_READ)
+ goto subreq_failed;
+
+ atomic_inc(&rreq->nr_outstanding); /* dropped in netfs_subreq_terminated() */
+
+ rreq->submitted += subreq->len;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ switch (source) {
+ case NETFS_FILL_WITH_ZEROES:
+ netfs_fill_with_zeroes(rreq, subreq);
+ break;
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_read_from_server(rreq, subreq);
+ break;
+ case NETFS_READ_FROM_CACHE:
+ netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE);
+ break;
+ default:
+ BUG();
+ }
+
+ return true;
+
+subreq_failed:
+ rreq->error = subreq->error;
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed);
+ return false;
+}
+
+/*
+ * Begin the process of reading in a chunk of data, where that data may be
+ * stitched together from multiple sources, including multiple servers and the
+ * local cache.
+ *
+ * The request is chopped into slices which are submitted one after another
+ * until the whole length has been submitted or a slice cannot be submitted.
+ * nr_outstanding starts at 1 so that the request cannot be terminated while
+ * slices are still being issued.
+ *
+ * If @sync is true, an extra ref is held on the request and the caller waits
+ * here, assessing the request each time nr_outstanding drops back to 1, until
+ * NETFS_RREQ_IN_PROGRESS is cleared. The return value is then rreq->error, or
+ * -EIO if there was no error but less than the full length was submitted.
+ *
+ * If @sync is false, the initial count is dropped, the request is assessed
+ * here if that took the count to zero, and 0 is returned.
+ *
+ * A zero-length request is rejected with -EIO after putting a ref on @rreq.
+ */
+int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
+{
+ unsigned int debug_index = 0;
+ int ret;
+
+ _enter("R=%x %llx-%llx",
+ rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
+
+ if (rreq->len == 0) {
+ pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len);
+ return -EIO;
+ }
+
+ INIT_WORK(&rreq->work, netfs_rreq_work);
+
+ if (sync)
+ netfs_get_request(rreq, netfs_rreq_trace_get_hold); /* put again before returning */
+
+ /* Chop the read into slices according to what the cache and the netfs
+ * want and submit each one.
+ */
+ atomic_set(&rreq->nr_outstanding, 1);
+ do {
+ if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ break;
+
+ } while (rreq->submitted < rreq->len);
+
+ if (sync) {
+ /* Keep nr_outstanding incremented so that the ref always belongs to
+ * us, and the service code isn't punted off to a random thread pool to
+ * process.
+ */
+ for (;;) {
+ wait_var_event(&rreq->nr_outstanding,
+ atomic_read(&rreq->nr_outstanding) == 1);
+ netfs_rreq_assess(rreq, false);
+ if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
+ break;
+ cond_resched();
+ }
+
+ ret = rreq->error;
+ if (ret == 0 && rreq->submitted < rreq->len) {
+ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
+ ret = -EIO;
+ }
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_hold);
+ } else {
+ /* If we decrement nr_outstanding to 0, the ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ netfs_rreq_assess(rreq, false);
+ ret = 0;
+ }
+ return ret;
+}
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
new file mode 100644
index 000000000000..068568702957
--- /dev/null
+++ b/fs/netfs/main.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Miscellaneous bits for the netfs support library.
+ *
+ * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/module.h>
+#include <linux/export.h>
+#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/netfs.h>
+
+MODULE_DESCRIPTION("Network fs support");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+/*
+ * Debugging mask.  The variable is netfs_debug, but it is exposed to userspace
+ * as the "debug" module parameter, so the description has to be keyed on that
+ * external name for it to be attached to the right parameter.
+ */
+unsigned netfs_debug;
+module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "Netfs support debugging mask");
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
new file mode 100644
index 000000000000..e86107b30ba4
--- /dev/null
+++ b/fs/netfs/objects.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Object lifetime handling and tracing.
+ *
+ * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * Allocate an I/O request and initialise it.
+ *
+ * The inode is taken from @file if one is given, otherwise from the host of
+ * @mapping, and the netfs operations come from that inode's netfs context.
+ * The request starts out with a refcount of 1 and NETFS_RREQ_IN_PROGRESS set,
+ * and records the inode size as read at allocation time.
+ *
+ * If the netfs supplies ->init_request(), it is called last; a negative
+ * return from it frees the request and is passed back to the caller.
+ *
+ * Returns the new request, ERR_PTR(-ENOMEM) if allocation fails or
+ * ERR_PTR() of the error from ->init_request().
+ */
+struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
+ struct file *file,
+ loff_t start, size_t len,
+ enum netfs_io_origin origin)
+{
+ static atomic_t debug_ids;
+ struct inode *inode = file ? file_inode(file) : mapping->host;
+ struct netfs_i_context *ctx = netfs_i_context(inode);
+ struct netfs_io_request *rreq;
+ int ret;
+
+ rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ if (!rreq)
+ return ERR_PTR(-ENOMEM);
+
+ rreq->start = start;
+ rreq->len = len;
+ rreq->origin = origin;
+ rreq->netfs_ops = ctx->ops;
+ rreq->mapping = mapping;
+ rreq->inode = inode;
+ rreq->i_size = i_size_read(inode);
+ rreq->debug_id = atomic_inc_return(&debug_ids);
+ INIT_LIST_HEAD(&rreq->subrequests);
+ refcount_set(&rreq->ref, 1);
+ __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ if (rreq->netfs_ops->init_request) {
+ ret = rreq->netfs_ops->init_request(rreq, file);
+ if (ret < 0) {
+ kfree(rreq);
+ return ERR_PTR(ret);
+ }
+ }
+
+ netfs_stat(&netfs_n_rh_rreq); /* only counted once initialisation has succeeded */
+ return rreq;
+}
+
+/*
+ * Take a ref on a request and trace the event, tagging it with @what.  The
+ * value traced is the refcount after the increment.
+ */
+void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what)
+{
+	int old_ref;
+
+	__refcount_inc(&rreq->ref, &old_ref);
+	trace_netfs_rreq_ref(rreq->debug_id, old_ref + 1, what);
+}
+
+/*
+ * Empty a request's subrequest list: each subrequest is unlinked from the
+ * front of the list and has a ref put on it, until none remain.
+ */
+void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
+{
+	struct netfs_io_subrequest *subreq;
+
+	while ((subreq = list_first_entry_or_null(&rreq->subrequests,
+						  struct netfs_io_subrequest,
+						  rreq_link)) != NULL) {
+		list_del(&subreq->rreq_link);
+		netfs_put_subrequest(subreq, was_async,
+				     netfs_sreq_trace_put_clear);
+	}
+}
+
+/*
+ * Tear down and free a request.  Shaped as a work function so that it can be
+ * either called directly or queued; @work is the work item embedded in the
+ * request.
+ */
+static void netfs_free_request(struct work_struct *work)
+{
+	struct netfs_io_request *rreq =
+		container_of(work, struct netfs_io_request, work);
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+	netfs_clear_subrequests(rreq, false);
+
+	/* Let the netfs dispose of its private data, if it attached any. */
+	if (rreq->netfs_priv)
+		rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
+
+	trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+
+	/* End the cache operation if one was begun. */
+	if (cres->ops)
+		cres->ops->end_operation(cres);
+
+	kfree(rreq);
+	netfs_stat_d(&netfs_n_rh_rreq);
+}
+
+/*
+ * Put a ref on a request and trace the event, tagging it with @what.  When
+ * the last ref goes, the request is freed: directly if @was_async is false,
+ * or by way of the unbound system workqueue if it is true.
+ */
+void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
+		       enum netfs_rreq_ref_trace what)
+{
+	/* Sampled up front so that the trace doesn't touch rreq after the
+	 * ref has been dropped.
+	 */
+	unsigned int id = rreq->debug_id;
+	bool last;
+	int old_ref;
+
+	last = __refcount_dec_and_test(&rreq->ref, &old_ref);
+	trace_netfs_rreq_ref(id, old_ref - 1, what);
+	if (!last)
+		return;
+
+	if (!was_async) {
+		netfs_free_request(&rreq->work);
+		return;
+	}
+
+	rreq->work.func = netfs_free_request;
+	if (!queue_work(system_unbound_wq, &rreq->work))
+		BUG();
+}
+
+/*
+ * Allocate and partially initialise an I/O subrequest structure.
+ *
+ * The subrequest starts out with a refcount of 2 and pins its parent request
+ * with a ref of its own.  Returns NULL if the allocation fails.
+ */
+struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
+{
+	struct netfs_io_subrequest *subreq;
+
+	subreq = kzalloc(sizeof(*subreq), GFP_KERNEL);
+	if (!subreq)
+		return NULL;
+
+	INIT_LIST_HEAD(&subreq->rreq_link);
+	refcount_set(&subreq->ref, 2);
+	subreq->rreq = rreq;
+	netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
+	netfs_stat(&netfs_n_rh_sreq);
+	return subreq;
+}
+
+/*
+ * Take a ref on a subrequest and trace the event, tagging it with @what.  The
+ * value traced is the refcount after the increment.
+ */
+void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
+			  enum netfs_sreq_ref_trace what)
+{
+	int old_ref;
+
+	__refcount_inc(&subreq->ref, &old_ref);
+	trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index,
+			     old_ref + 1, what);
+}
+
+/*
+ * Free a subrequest and drop the ref it held on its parent request.
+ */
+static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
+				  bool was_async)
+{
+	/* Remember the parent; subreq is gone by the time we put it. */
+	struct netfs_io_request *parent = subreq->rreq;
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_free);
+	kfree(subreq);
+	netfs_stat_d(&netfs_n_rh_sreq);
+	netfs_put_request(parent, was_async, netfs_rreq_trace_put_subreq);
+}
+
+/*
+ * Put a ref on a subrequest and trace the event, tagging it with @what.  The
+ * subrequest is freed when the last ref goes.
+ */
+void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async,
+			  enum netfs_sreq_ref_trace what)
+{
+	/* Sampled up front so that the trace doesn't touch subreq after the
+	 * ref has been dropped.
+	 */
+	unsigned int index = subreq->debug_index;
+	unsigned int rreq_id = subreq->rreq->debug_id;
+	bool last;
+	int old_ref;
+
+	last = __refcount_dec_and_test(&subreq->ref, &old_ref);
+	trace_netfs_sreq_ref(rreq_id, index, old_ref - 1, what);
+	if (last)
+		netfs_free_subrequest(subreq, was_async);
+}
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
deleted file mode 100644
index 994ec22d4040..000000000000
--- a/fs/netfs/read_helper.c
+++ /dev/null
@@ -1,1208 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Network filesystem high-level read support.
- *
- * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/module.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uio.h>
-#include <linux/sched/mm.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/netfs.h>
-#include "internal.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/netfs.h>
-
-MODULE_DESCRIPTION("Network fs support");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
-unsigned netfs_debug;
-module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
-
-static void netfs_rreq_work(struct work_struct *);
-static void __netfs_put_subrequest(struct netfs_read_subrequest *, bool);
-
-static void netfs_put_subrequest(struct netfs_read_subrequest *subreq,
- bool was_async)
-{
- if (refcount_dec_and_test(&subreq->usage))
- __netfs_put_subrequest(subreq, was_async);
-}
-
-static struct netfs_read_request *netfs_alloc_read_request(
- const struct netfs_read_request_ops *ops, void *netfs_priv,
- struct file *file)
-{
- static atomic_t debug_ids;
- struct netfs_read_request *rreq;
-
- rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL);
- if (rreq) {
- rreq->netfs_ops = ops;
- rreq->netfs_priv = netfs_priv;
- rreq->inode = file_inode(file);
- rreq->i_size = i_size_read(rreq->inode);
- rreq->debug_id = atomic_inc_return(&debug_ids);
- INIT_LIST_HEAD(&rreq->subrequests);
- INIT_WORK(&rreq->work, netfs_rreq_work);
- refcount_set(&rreq->usage, 1);
- __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- ops->init_rreq(rreq, file);
- netfs_stat(&netfs_n_rh_rreq);
- }
-
- return rreq;
-}
-
-static void netfs_get_read_request(struct netfs_read_request *rreq)
-{
- refcount_inc(&rreq->usage);
-}
-
-static void netfs_rreq_clear_subreqs(struct netfs_read_request *rreq,
- bool was_async)
-{
- struct netfs_read_subrequest *subreq;
-
- while (!list_empty(&rreq->subrequests)) {
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_read_subrequest, rreq_link);
- list_del(&subreq->rreq_link);
- netfs_put_subrequest(subreq, was_async);
- }
-}
-
-static void netfs_free_read_request(struct work_struct *work)
-{
- struct netfs_read_request *rreq =
- container_of(work, struct netfs_read_request, work);
- netfs_rreq_clear_subreqs(rreq, false);
- if (rreq->netfs_priv)
- rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
- trace_netfs_rreq(rreq, netfs_rreq_trace_free);
- if (rreq->cache_resources.ops)
- rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
- netfs_stat_d(&netfs_n_rh_rreq);
-}
-
-static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async)
-{
- if (refcount_dec_and_test(&rreq->usage)) {
- if (was_async) {
- rreq->work.func = netfs_free_read_request;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_free_read_request(&rreq->work);
- }
- }
-}
-
-/*
- * Allocate and partially initialise an I/O request structure.
- */
-static struct netfs_read_subrequest *netfs_alloc_subrequest(
- struct netfs_read_request *rreq)
-{
- struct netfs_read_subrequest *subreq;
-
- subreq = kzalloc(sizeof(struct netfs_read_subrequest), GFP_KERNEL);
- if (subreq) {
- INIT_LIST_HEAD(&subreq->rreq_link);
- refcount_set(&subreq->usage, 2);
- subreq->rreq = rreq;
- netfs_get_read_request(rreq);
- netfs_stat(&netfs_n_rh_sreq);
- }
-
- return subreq;
-}
-
-static void netfs_get_read_subrequest(struct netfs_read_subrequest *subreq)
-{
- refcount_inc(&subreq->usage);
-}
-
-static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
- bool was_async)
-{
- struct netfs_read_request *rreq = subreq->rreq;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_free);
- kfree(subreq);
- netfs_stat_d(&netfs_n_rh_sreq);
- netfs_put_read_request(rreq, was_async);
-}
-
-/*
- * Clear the unread part of an I/O request.
- */
-static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
-{
- struct iov_iter iter;
-
- iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
- iov_iter_zero(iov_iter_count(&iter), &iter);
-}
-
-static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_read_subrequest *subreq = priv;
-
- netfs_subreq_terminated(subreq, transferred_or_error, was_async);
-}
-
-/*
- * Issue a read against the cache.
- * - Eats the caller's ref on subreq.
- */
-static void netfs_read_from_cache(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq,
- bool seek_data)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct iov_iter iter;
-
- netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
-
- cres->ops->read(cres, subreq->start, &iter, seek_data,
- netfs_cache_read_terminated, subreq);
-}
-
-/*
- * Fill a subrequest region with zeroes.
- */
-static void netfs_fill_with_zeroes(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq)
-{
- netfs_stat(&netfs_n_rh_zero);
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- netfs_subreq_terminated(subreq, 0, false);
-}
-
-/*
- * Ask the netfs to issue a read request to the server for us.
- *
- * The netfs is expected to read from subreq->pos + subreq->transferred to
- * subreq->pos + subreq->len - 1. It may not backtrack and write data into the
- * buffer prior to the transferred point as it might clobber dirty data
- * obtained from the cache.
- *
- * Alternatively, the netfs is allowed to indicate one of two things:
- *
- * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
- * make progress.
- *
- * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
- * cleared.
- */
-static void netfs_read_from_server(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq)
-{
- netfs_stat(&netfs_n_rh_download);
- rreq->netfs_ops->issue_op(subreq);
-}
-
-/*
- * Release those waiting.
- */
-static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async)
-{
- trace_netfs_rreq(rreq, netfs_rreq_trace_done);
- netfs_rreq_clear_subreqs(rreq, was_async);
- netfs_put_read_request(rreq, was_async);
-}
-
-/*
- * Deal with the completion of writing the data to the cache. We have to clear
- * the PG_fscache bits on the pages involved and release the caller's ref.
- *
- * May be called in softirq mode and we inherit a ref from the caller.
- */
-static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
- bool was_async)
-{
- struct netfs_read_subrequest *subreq;
- struct page *page;
- pgoff_t unlocked = 0;
- bool have_unlocked = false;
-
- rcu_read_lock();
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
-
- xas_for_each(&xas, page, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
- /* We might have multiple writes from the same huge
- * page, but we mustn't unlock a page more than once.
- */
- if (have_unlocked && page->index <= unlocked)
- continue;
- unlocked = page->index;
- end_page_fscache(page);
- have_unlocked = true;
- }
- }
-
- rcu_read_unlock();
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_read_subrequest *subreq = priv;
- struct netfs_read_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- netfs_stat(&netfs_n_rh_write_failed);
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_copy_to_cache);
- } else {
- netfs_stat(&netfs_n_rh_write_done);
- }
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
-
- /* If we decrement nr_wr_ops to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_wr_ops))
- netfs_rreq_unmark_after_write(rreq, was_async);
-
- netfs_put_subrequest(subreq, was_async);
-}
-
-/*
- * Perform any outstanding writes to the cache. We inherit a ref from the
- * caller.
- */
-static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct netfs_read_subrequest *subreq, *next, *p;
- struct iov_iter iter;
- int ret;
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_write);
-
- /* We don't want terminating writes trying to wake us up whilst we're
- * still going through the list.
- */
- atomic_inc(&rreq->nr_wr_ops);
-
- list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
- if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) {
- list_del_init(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false);
- }
- }
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- /* Amalgamate adjacent writes */
- while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- next = list_next_entry(subreq, rreq_link);
- if (next->start != subreq->start + subreq->len)
- break;
- subreq->len += next->len;
- list_del_init(&next->rreq_link);
- netfs_put_subrequest(next, false);
- }
-
- ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- rreq->i_size);
- if (ret < 0) {
- trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
- continue;
- }
-
- iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
- subreq->start, subreq->len);
-
- atomic_inc(&rreq->nr_wr_ops);
- netfs_stat(&netfs_n_rh_write);
- netfs_get_read_subrequest(subreq);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write);
- cres->ops->write(cres, subreq->start, &iter,
- netfs_rreq_copy_terminated, subreq);
- }
-
- /* If we decrement nr_wr_ops to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_wr_ops))
- netfs_rreq_unmark_after_write(rreq, false);
-}
-
-static void netfs_rreq_write_to_cache_work(struct work_struct *work)
-{
- struct netfs_read_request *rreq =
- container_of(work, struct netfs_read_request, work);
-
- netfs_rreq_do_write_to_cache(rreq);
-}
-
-static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq,
- bool was_async)
-{
- if (was_async) {
- rreq->work.func = netfs_rreq_write_to_cache_work;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_rreq_do_write_to_cache(rreq);
- }
-}
-
-/*
- * Unlock the pages in a read operation. We need to set PG_fscache on any
- * pages we're going to write back before we unlock them.
- */
-static void netfs_rreq_unlock(struct netfs_read_request *rreq)
-{
- struct netfs_read_subrequest *subreq;
- struct page *page;
- unsigned int iopos, account = 0;
- pgoff_t start_page = rreq->start / PAGE_SIZE;
- pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
- bool subreq_failed = false;
- int i;
-
- XA_STATE(xas, &rreq->mapping->i_pages, start_page);
-
- if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
- __clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
- }
- }
-
- /* Walk through the pagecache and the I/O request lists simultaneously.
- * We may have a mixture of cached and uncached sections and we only
- * really want to write out the uncached sections. This is slightly
- * complicated by the possibility that we might have huge pages with a
- * mixture inside.
- */
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_read_subrequest, rreq_link);
- iopos = 0;
- subreq_failed = (subreq->error < 0);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
-
- rcu_read_lock();
- xas_for_each(&xas, page, last_page) {
- unsigned int pgpos = (page->index - start_page) * PAGE_SIZE;
- unsigned int pgend = pgpos + thp_size(page);
- bool pg_failed = false;
-
- for (;;) {
- if (!subreq) {
- pg_failed = true;
- break;
- }
- if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
- set_page_fscache(page);
- pg_failed |= subreq_failed;
- if (pgend < iopos + subreq->len)
- break;
-
- account += subreq->transferred;
- iopos += subreq->len;
- if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- subreq = list_next_entry(subreq, rreq_link);
- subreq_failed = (subreq->error < 0);
- } else {
- subreq = NULL;
- subreq_failed = false;
- }
- if (pgend == iopos)
- break;
- }
-
- if (!pg_failed) {
- for (i = 0; i < thp_nr_pages(page); i++)
- flush_dcache_page(page);
- SetPageUptodate(page);
- }
-
- if (!test_bit(NETFS_RREQ_DONT_UNLOCK_PAGES, &rreq->flags)) {
- if (page->index == rreq->no_unlock_page &&
- test_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags))
- _debug("no unlock");
- else
- unlock_page(page);
- }
- }
- rcu_read_unlock();
-
- task_io_account_read(account);
- if (rreq->netfs_ops->done)
- rreq->netfs_ops->done(rreq);
-}
-
-/*
- * Handle a short read.
- */
-static void netfs_rreq_short_read(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq)
-{
- __clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
- __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
-
- netfs_stat(&netfs_n_rh_short_read);
- trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
-
- netfs_get_read_subrequest(subreq);
- atomic_inc(&rreq->nr_rd_ops);
- if (subreq->source == NETFS_READ_FROM_CACHE)
- netfs_read_from_cache(rreq, subreq, true);
- else
- netfs_read_from_server(rreq, subreq);
-}
-
-/*
- * Resubmit any short or failed operations. Returns true if we got the rreq
- * ref back.
- */
-static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
-{
- struct netfs_read_subrequest *subreq;
-
- WARN_ON(in_interrupt());
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
-
- /* We don't want terminating submissions trying to wake us up whilst
- * we're still going through the list.
- */
- atomic_inc(&rreq->nr_rd_ops);
-
- __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->error) {
- if (subreq->source != NETFS_READ_FROM_CACHE)
- break;
- subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
- subreq->error = 0;
- netfs_stat(&netfs_n_rh_download_instead);
- trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
- netfs_get_read_subrequest(subreq);
- atomic_inc(&rreq->nr_rd_ops);
- netfs_read_from_server(rreq, subreq);
- } else if (test_bit(NETFS_SREQ_SHORT_READ, &subreq->flags)) {
- netfs_rreq_short_read(rreq, subreq);
- }
- }
-
- /* If we decrement nr_rd_ops to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_rd_ops))
- return true;
-
- wake_up_var(&rreq->nr_rd_ops);
- return false;
-}
-
-/*
- * Check to see if the data read is still valid.
- */
-static void netfs_rreq_is_still_valid(struct netfs_read_request *rreq)
-{
- struct netfs_read_subrequest *subreq;
-
- if (!rreq->netfs_ops->is_still_valid ||
- rreq->netfs_ops->is_still_valid(rreq))
- return;
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->source == NETFS_READ_FROM_CACHE) {
- subreq->error = -ESTALE;
- __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- }
- }
-}
-
-/*
- * Assess the state of a read request and decide what to do next.
- *
- * Note that we could be in an ordinary kernel thread, on a workqueue or in
- * softirq context at this point. We inherit a ref from the caller.
- */
-static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async)
-{
- trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
-
-again:
- netfs_rreq_is_still_valid(rreq);
-
- if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
- test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
- if (netfs_rreq_perform_resubmissions(rreq))
- goto again;
- return;
- }
-
- netfs_rreq_unlock(rreq);
-
- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
-
- if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags))
- return netfs_rreq_write_to_cache(rreq, was_async);
-
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_work(struct work_struct *work)
-{
- struct netfs_read_request *rreq =
- container_of(work, struct netfs_read_request, work);
- netfs_rreq_assess(rreq, false);
-}
-
-/*
- * Handle the completion of all outstanding I/O operations on a read request.
- * We inherit a ref from the caller.
- */
-static void netfs_rreq_terminated(struct netfs_read_request *rreq,
- bool was_async)
-{
- if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
- was_async) {
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_rreq_assess(rreq, was_async);
- }
-}
-
-/**
- * netfs_subreq_terminated - Note the termination of an I/O operation.
- * @subreq: The I/O request that has terminated.
- * @transferred_or_error: The amount of data transferred or an error code.
- * @was_async: The termination was asynchronous
- *
- * This tells the read helper that a contributory I/O operation has terminated,
- * one way or another, and that it should integrate the results.
- *
- * The caller indicates in @transferred_or_error the outcome of the operation,
- * supplying a positive value to indicate the number of bytes transferred, 0 to
- * indicate a failure to transfer anything that should be retried or a negative
- * error code. The helper will look after reissuing I/O operations as
- * appropriate and writing downloaded data to the cache.
- *
- * If @was_async is true, the caller might be running in softirq or interrupt
- * context and we can't sleep.
- */
-void netfs_subreq_terminated(struct netfs_read_subrequest *subreq,
- ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_read_request *rreq = subreq->rreq;
- int u;
-
- _enter("[%u]{%llx,%lx},%zd",
- subreq->debug_index, subreq->start, subreq->flags,
- transferred_or_error);
-
- switch (subreq->source) {
- case NETFS_READ_FROM_CACHE:
- netfs_stat(&netfs_n_rh_read_done);
- break;
- case NETFS_DOWNLOAD_FROM_SERVER:
- netfs_stat(&netfs_n_rh_download_done);
- break;
- default:
- break;
- }
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- subreq->error = transferred_or_error;
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_read);
- goto failed;
- }
-
- if (WARN(transferred_or_error > subreq->len - subreq->transferred,
- "Subreq overread: R%x[%x] %zd > %zu - %zu",
- rreq->debug_id, subreq->debug_index,
- transferred_or_error, subreq->len, subreq->transferred))
- transferred_or_error = subreq->len - subreq->transferred;
-
- subreq->error = 0;
- subreq->transferred += transferred_or_error;
- if (subreq->transferred < subreq->len)
- goto incomplete;
-
-complete:
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
- set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
-
-out:
- trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
- u = atomic_dec_return(&rreq->nr_rd_ops);
- if (u == 0)
- netfs_rreq_terminated(rreq, was_async);
- else if (u == 1)
- wake_up_var(&rreq->nr_rd_ops);
-
- netfs_put_subrequest(subreq, was_async);
- return;
-
-incomplete:
- if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
- netfs_clear_unread(subreq);
- subreq->transferred = subreq->len;
- goto complete;
- }
-
- if (transferred_or_error == 0) {
- if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
- subreq->error = -ENODATA;
- goto failed;
- }
- } else {
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- }
-
- __set_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- goto out;
-
-failed:
- if (subreq->source == NETFS_READ_FROM_CACHE) {
- netfs_stat(&netfs_n_rh_read_failed);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- } else {
- netfs_stat(&netfs_n_rh_download_failed);
- set_bit(NETFS_RREQ_FAILED, &rreq->flags);
- rreq->error = subreq->error;
- }
- goto out;
-}
-EXPORT_SYMBOL(netfs_subreq_terminated);
-
-static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequest *subreq,
- loff_t i_size)
-{
- struct netfs_read_request *rreq = subreq->rreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
-
- if (cres->ops)
- return cres->ops->prepare_read(subreq, i_size);
- if (subreq->start >= rreq->i_size)
- return NETFS_FILL_WITH_ZEROES;
- return NETFS_DOWNLOAD_FROM_SERVER;
-}
-
-/*
- * Work out what sort of subrequest the next one will be.
- */
-static enum netfs_read_source
-netfs_rreq_prepare_read(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq)
-{
- enum netfs_read_source source;
-
- _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
-
- source = netfs_cache_prepare_read(subreq, rreq->i_size);
- if (source == NETFS_INVALID_READ)
- goto out;
-
- if (source == NETFS_DOWNLOAD_FROM_SERVER) {
- /* Call out to the netfs to let it shrink the request to fit
- * its own I/O sizes and boundaries. If it shinks it here, it
- * will be called again to make simultaneous calls; if it wants
- * to make serial calls, it can indicate a short read and then
- * we will call it again.
- */
- if (subreq->len > rreq->i_size - subreq->start)
- subreq->len = rreq->i_size - subreq->start;
-
- if (rreq->netfs_ops->clamp_length &&
- !rreq->netfs_ops->clamp_length(subreq)) {
- source = NETFS_INVALID_READ;
- goto out;
- }
- }
-
- if (WARN_ON(subreq->len == 0))
- source = NETFS_INVALID_READ;
-
-out:
- subreq->source = source;
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
- return source;
-}
-
-/*
- * Slice off a piece of a read request and submit an I/O request for it.
- */
-static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq,
- unsigned int *_debug_index)
-{
- struct netfs_read_subrequest *subreq;
- enum netfs_read_source source;
-
- subreq = netfs_alloc_subrequest(rreq);
- if (!subreq)
- return false;
-
- subreq->debug_index = (*_debug_index)++;
- subreq->start = rreq->start + rreq->submitted;
- subreq->len = rreq->len - rreq->submitted;
-
- _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
- list_add_tail(&subreq->rreq_link, &rreq->subrequests);
-
- /* Call out to the cache to find out what it can do with the remaining
- * subset. It tells us in subreq->flags what it decided should be done
- * and adjusts subreq->len down if the subset crosses a cache boundary.
- *
- * Then when we hand the subset, it can choose to take a subset of that
- * (the starts must coincide), in which case, we go around the loop
- * again and ask it to download the next piece.
- */
- source = netfs_rreq_prepare_read(rreq, subreq);
- if (source == NETFS_INVALID_READ)
- goto subreq_failed;
-
- atomic_inc(&rreq->nr_rd_ops);
-
- rreq->submitted += subreq->len;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
- switch (source) {
- case NETFS_FILL_WITH_ZEROES:
- netfs_fill_with_zeroes(rreq, subreq);
- break;
- case NETFS_DOWNLOAD_FROM_SERVER:
- netfs_read_from_server(rreq, subreq);
- break;
- case NETFS_READ_FROM_CACHE:
- netfs_read_from_cache(rreq, subreq, false);
- break;
- default:
- BUG();
- }
-
- return true;
-
-subreq_failed:
- rreq->error = subreq->error;
- netfs_put_subrequest(subreq, false);
- return false;
-}
-
-static void netfs_cache_expand_readahead(struct netfs_read_request *rreq,
- loff_t *_start, size_t *_len, loff_t i_size)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
-
- if (cres->ops && cres->ops->expand_readahead)
- cres->ops->expand_readahead(cres, _start, _len, i_size);
-}
-
-static void netfs_rreq_expand(struct netfs_read_request *rreq,
- struct readahead_control *ractl)
-{
- /* Give the cache a chance to change the request parameters. The
- * resultant request must contain the original region.
- */
- netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
-
- /* Give the netfs a chance to change the request parameters. The
- * resultant request must contain the original region.
- */
- if (rreq->netfs_ops->expand_readahead)
- rreq->netfs_ops->expand_readahead(rreq);
-
- /* Expand the request if the cache wants it to start earlier. Note
- * that the expansion may get further extended if the VM wishes to
- * insert THPs and the preferred start and/or end wind up in the middle
- * of THPs.
- *
- * If this is the case, however, the THP size should be an integer
- * multiple of the cache granule size, so we get a whole number of
- * granules to deal with.
- */
- if (rreq->start != readahead_pos(ractl) ||
- rreq->len != readahead_length(ractl)) {
- readahead_expand(ractl, rreq->start, rreq->len);
- rreq->start = readahead_pos(ractl);
- rreq->len = readahead_length(ractl);
-
- trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
- netfs_read_trace_expanded);
- }
-}
-
-/**
- * netfs_readahead - Helper to manage a read request
- * @ractl: The description of the readahead request
- * @ops: The network filesystem's operations for the helper to use
- * @netfs_priv: Private netfs data to be retained in the request
- *
- * Fulfil a readahead request by drawing data from the cache if possible, or
- * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
- * requests from different sources will get munged together. If necessary, the
- * readahead window can be expanded in either direction to a more convenient
- * alighment for RPC efficiency or to make storage in the cache feasible.
- *
- * The calling netfs must provide a table of operations, only one of which,
- * issue_op, is mandatory. It may also be passed a private token, which will
- * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
- *
- * This is usable whether or not caching is enabled.
- */
-void netfs_readahead(struct readahead_control *ractl,
- const struct netfs_read_request_ops *ops,
- void *netfs_priv)
-{
- struct netfs_read_request *rreq;
- struct page *page;
- unsigned int debug_index = 0;
- int ret;
-
- _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
-
- if (readahead_count(ractl) == 0)
- goto cleanup;
-
- rreq = netfs_alloc_read_request(ops, netfs_priv, ractl->file);
- if (!rreq)
- goto cleanup;
- rreq->mapping = ractl->mapping;
- rreq->start = readahead_pos(ractl);
- rreq->len = readahead_length(ractl);
-
- if (ops->begin_cache_operation) {
- ret = ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto cleanup_free;
- }
-
- netfs_stat(&netfs_n_rh_readahead);
- trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
- netfs_read_trace_readahead);
-
- netfs_rreq_expand(rreq, ractl);
-
- atomic_set(&rreq->nr_rd_ops, 1);
- do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
- break;
-
- } while (rreq->submitted < rreq->len);
-
- /* Drop the refs on the pages here rather than in the cache or
- * filesystem. The locks will be dropped in netfs_rreq_unlock().
- */
- while ((page = readahead_page(ractl)))
- put_page(page);
-
- /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_rd_ops))
- netfs_rreq_assess(rreq, false);
- return;
-
-cleanup_free:
- netfs_put_read_request(rreq, false);
- return;
-cleanup:
- if (netfs_priv)
- ops->cleanup(ractl->mapping, netfs_priv);
- return;
-}
-EXPORT_SYMBOL(netfs_readahead);
-
-/**
- * netfs_readpage - Helper to manage a readpage request
- * @file: The file to read from
- * @page: The page to read
- * @ops: The network filesystem's operations for the helper to use
- * @netfs_priv: Private netfs data to be retained in the request
- *
- * Fulfil a readpage request by drawing data from the cache if possible, or the
- * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests
- * from different sources will get munged together.
- *
- * The calling netfs must provide a table of operations, only one of which,
- * issue_op, is mandatory. It may also be passed a private token, which will
- * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
- *
- * This is usable whether or not caching is enabled.
- */
-int netfs_readpage(struct file *file,
- struct page *page,
- const struct netfs_read_request_ops *ops,
- void *netfs_priv)
-{
- struct netfs_read_request *rreq;
- unsigned int debug_index = 0;
- int ret;
-
- _enter("%lx", page_index(page));
-
- rreq = netfs_alloc_read_request(ops, netfs_priv, file);
- if (!rreq) {
- if (netfs_priv)
- ops->cleanup(netfs_priv, page_file_mapping(page));
- unlock_page(page);
- return -ENOMEM;
- }
- rreq->mapping = page_file_mapping(page);
- rreq->start = page_file_offset(page);
- rreq->len = thp_size(page);
-
- if (ops->begin_cache_operation) {
- ret = ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) {
- unlock_page(page);
- goto out;
- }
- }
-
- netfs_stat(&netfs_n_rh_readpage);
- trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
-
- netfs_get_read_request(rreq);
-
- atomic_set(&rreq->nr_rd_ops, 1);
- do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
- break;
-
- } while (rreq->submitted < rreq->len);
-
- /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
- * the service code isn't punted off to a random thread pool to
- * process.
- */
- do {
- wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
- netfs_rreq_assess(rreq, false);
- } while (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags));
-
- ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len) {
- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_readpage);
- ret = -EIO;
- }
-out:
- netfs_put_read_request(rreq, false);
- return ret;
-}
-EXPORT_SYMBOL(netfs_readpage);
-
-/**
- * netfs_skip_page_read - prep a page for writing without reading first
- * @page: page being prepared
- * @pos: starting position for the write
- * @len: length of write
- *
- * In some cases, write_begin doesn't need to read at all:
- * - full page write
- * - write that lies in a page that is completely beyond EOF
- * - write that covers the the page from start to EOF or beyond it
- *
- * If any of these criteria are met, then zero out the unwritten parts
- * of the page and return true. Otherwise, return false.
- */
-static bool netfs_skip_page_read(struct page *page, loff_t pos, size_t len)
-{
- struct inode *inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
- size_t offset = offset_in_thp(page, pos);
-
- /* Full page write */
- if (offset == 0 && len >= thp_size(page))
- return true;
-
- /* pos beyond last page in the file */
- if (pos - offset >= i_size)
- goto zero_out;
-
- /* Write that covers from the start of the page to EOF or beyond */
- if (offset == 0 && (pos + len) >= i_size)
- goto zero_out;
-
- return false;
-zero_out:
- zero_user_segments(page, 0, offset, offset + len, thp_size(page));
- return true;
-}
-
-/**
- * netfs_write_begin - Helper to prepare for writing
- * @file: The file to read from
- * @mapping: The mapping to read from
- * @pos: File position at which the write will begin
- * @len: The length of the write (may extend beyond the end of the page chosen)
- * @flags: AOP_* flags
- * @_page: Where to put the resultant page
- * @_fsdata: Place for the netfs to store a cookie
- * @ops: The network filesystem's operations for the helper to use
- * @netfs_priv: Private netfs data to be retained in the request
- *
- * Pre-read data for a write-begin request by drawing data from the cache if
- * possible, or the netfs if not. Space beyond the EOF is zero-filled.
- * Multiple I/O requests from different sources will get munged together. If
- * necessary, the readahead window can be expanded in either direction to a
- * more convenient alighment for RPC efficiency or to make storage in the cache
- * feasible.
- *
- * The calling netfs must provide a table of operations, only one of which,
- * issue_op, is mandatory.
- *
- * The check_write_begin() operation can be provided to check for and flush
- * conflicting writes once the page is grabbed and locked. It is passed a
- * pointer to the fsdata cookie that gets returned to the VM to be passed to
- * write_end. It is permitted to sleep. It should return 0 if the request
- * should go ahead; unlock the page and return -EAGAIN to cause the page to be
- * regot; or return an error.
- *
- * This is usable whether or not caching is enabled.
- */
-int netfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int flags,
- struct page **_page, void **_fsdata,
- const struct netfs_read_request_ops *ops,
- void *netfs_priv)
-{
- struct netfs_read_request *rreq;
- struct page *page, *xpage;
- struct inode *inode = file_inode(file);
- unsigned int debug_index = 0;
- pgoff_t index = pos >> PAGE_SHIFT;
- int ret;
-
- DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
-
-retry:
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
-
- if (ops->check_write_begin) {
- /* Allow the netfs (eg. ceph) to flush conflicts. */
- ret = ops->check_write_begin(file, pos, len, page, _fsdata);
- if (ret < 0) {
- trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
- if (ret == -EAGAIN)
- goto retry;
- goto error;
- }
- }
-
- if (PageUptodate(page))
- goto have_page;
-
- /* If the page is beyond the EOF, we want to clear it - unless it's
- * within the cache granule containing the EOF, in which case we need
- * to preload the granule.
- */
- if (!ops->is_cache_enabled(inode) &&
- netfs_skip_page_read(page, pos, len)) {
- netfs_stat(&netfs_n_rh_write_zskip);
- goto have_page_no_wait;
- }
-
- ret = -ENOMEM;
- rreq = netfs_alloc_read_request(ops, netfs_priv, file);
- if (!rreq)
- goto error;
- rreq->mapping = page->mapping;
- rreq->start = page_offset(page);
- rreq->len = thp_size(page);
- rreq->no_unlock_page = page->index;
- __set_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags);
- netfs_priv = NULL;
-
- if (ops->begin_cache_operation) {
- ret = ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto error_put;
- }
-
- netfs_stat(&netfs_n_rh_write_begin);
- trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
-
- /* Expand the request to meet caching requirements and download
- * preferences.
- */
- ractl._nr_pages = thp_nr_pages(page);
- netfs_rreq_expand(rreq, &ractl);
- netfs_get_read_request(rreq);
-
- /* We hold the page locks, so we can drop the references */
- while ((xpage = readahead_page(&ractl)))
- if (xpage != page)
- put_page(xpage);
-
- atomic_set(&rreq->nr_rd_ops, 1);
- do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
- break;
-
- } while (rreq->submitted < rreq->len);
-
- /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
- * the service code isn't punted off to a random thread pool to
- * process.
- */
- for (;;) {
- wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
- netfs_rreq_assess(rreq, false);
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- break;
- cond_resched();
- }
-
- ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len) {
- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_write_begin);
- ret = -EIO;
- }
- netfs_put_read_request(rreq, false);
- if (ret < 0)
- goto error;
-
-have_page:
- ret = wait_on_page_fscache_killable(page);
- if (ret < 0)
- goto error;
-have_page_no_wait:
- if (netfs_priv)
- ops->cleanup(netfs_priv, mapping);
- *_page = page;
- _leave(" = 0");
- return 0;
-
-error_put:
- netfs_put_read_request(rreq, false);
-error:
- unlock_page(page);
- put_page(page);
- if (netfs_priv)
- ops->cleanup(netfs_priv, mapping);
- _leave(" = %d", ret);
- return ret;
-}
-EXPORT_SYMBOL(netfs_write_begin);
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 9ae538c85378..5510a7a14a40 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -7,7 +7,6 @@
#include <linux/export.h>
#include <linux/seq_file.h>
-#include <linux/netfs.h>
#include "internal.h"
atomic_t netfs_n_rh_readahead;
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 22d11fdc6deb..5f6db37f461e 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -12,7 +12,7 @@ nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
export.o sysfs.o fs_context.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
-nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+nfs-$(CONFIG_NFS_FSCACHE) += fscache.o
obj-$(CONFIG_NFS_V2) += nfsv2.o
nfsv2-y := nfs2super.o proc.o nfs2xdr.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index fe860c538747..79a8b451791f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -115,23 +115,6 @@ bl_submit_bio(struct bio *bio)
return NULL;
}
-static struct bio *bl_alloc_init_bio(unsigned int npg,
- struct block_device *bdev, sector_t disk_sector,
- bio_end_io_t end_io, struct parallel_io *par)
-{
- struct bio *bio;
-
- npg = bio_max_segs(npg);
- bio = bio_alloc(GFP_NOIO, npg);
- if (bio) {
- bio->bi_iter.bi_sector = disk_sector;
- bio_set_dev(bio, bdev);
- bio->bi_end_io = end_io;
- bio->bi_private = par;
- }
- return bio;
-}
-
static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
{
return offset >= map->start && offset < map->start + map->len;
@@ -171,11 +154,10 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
retry:
if (!bio) {
- bio = bl_alloc_init_bio(npg, map->bdev,
- disk_addr >> SECTOR_SHIFT, end_io, par);
- if (!bio)
- return ERR_PTR(-ENOMEM);
- bio_set_op_attrs(bio, rw, 0);
+ bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO);
+ bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT;
+ bio->bi_end_io = end_io;
+ bio->bi_private = par;
}
if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(bio);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index acb1d22907da..5e56da748b2a 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -252,7 +252,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
d->bdev = bdev;
- d->len = i_size_read(d->bdev->bd_inode);
+ d->len = bdev_nr_bytes(d->bdev);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
@@ -367,7 +367,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return PTR_ERR(bdev);
d->bdev = bdev;
- d->len = i_size_read(d->bdev->bd_inode);
+ d->len = bdev_nr_bytes(d->bdev);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index ef9db135c649..6c977288cc28 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -27,7 +27,6 @@
*/
#include <linux/module.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include "blocklayout.h"
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 86d856de1389..456af7d230cf 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,7 +17,6 @@
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/freezer.h>
-#include <linux/kthread.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/bc_xprt.h>
@@ -45,18 +44,18 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
int ret;
struct nfs_net *nn = net_generic(net, nfs_net_id);
- ret = svc_create_xprt(serv, "tcp", net, PF_INET,
- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
- cred);
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
if (ret <= 0)
goto out_err;
nn->nfs_callback_tcpport = ret;
dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
nn->nfs_callback_tcpport, PF_INET, net->ns.inum);
- ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
- cred);
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET6,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
if (ret > 0) {
nn->nfs_callback_tcpport6 = ret;
dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
@@ -92,8 +91,8 @@ nfs4_callback_svc(void *vrqstp)
continue;
svc_process(rqstp);
}
+
svc_exit_thread(rqstp);
- module_put_and_exit(0);
return 0;
}
@@ -136,8 +135,8 @@ nfs41_callback_svc(void *vrqstp)
finish_wait(&serv->sv_cb_waitq, &wq);
}
}
+
svc_exit_thread(rqstp);
- module_put_and_exit(0);
return 0;
}
@@ -169,12 +168,12 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS)
nrservs = NFS4_MIN_NR_CALLBACK_THREADS;
- if (serv->sv_nrthreads-1 == nrservs)
+ if (serv->sv_nrthreads == nrservs)
return 0;
- ret = serv->sv_ops->svo_setup(serv, NULL, nrservs);
+ ret = svc_set_num_threads(serv, NULL, nrservs);
if (ret) {
- serv->sv_ops->svo_setup(serv, NULL, 0);
+ svc_set_num_threads(serv, NULL, 0);
return ret;
}
dprintk("nfs_callback_up: service started\n");
@@ -189,7 +188,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
return;
dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
- svc_shutdown_net(serv, net);
+ svc_xprt_destroy_all(serv, net);
}
static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
@@ -232,59 +231,17 @@ err_bind:
return ret;
}
-static const struct svc_serv_ops nfs40_cb_sv_ops = {
- .svo_function = nfs4_callback_svc,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_setup = svc_set_num_threads_sync,
- .svo_module = THIS_MODULE,
-};
-#if defined(CONFIG_NFS_V4_1)
-static const struct svc_serv_ops nfs41_cb_sv_ops = {
- .svo_function = nfs41_callback_svc,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_setup = svc_set_num_threads_sync,
- .svo_module = THIS_MODULE,
-};
-
-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
- [0] = &nfs40_cb_sv_ops,
- [1] = &nfs41_cb_sv_ops,
-};
-#else
-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
- [0] = &nfs40_cb_sv_ops,
- [1] = NULL,
-};
-#endif
-
static struct svc_serv *nfs_callback_create_svc(int minorversion)
{
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
- const struct svc_serv_ops *sv_ops;
+ int (*threadfn)(void *data);
struct svc_serv *serv;
/*
* Check whether we're already up and running.
*/
- if (cb_info->serv) {
- /*
- * Note: increase service usage, because later in case of error
- * svc_destroy() will be called.
- */
- svc_get(cb_info->serv);
- return cb_info->serv;
- }
-
- switch (minorversion) {
- case 0:
- sv_ops = nfs4_cb_sv_ops[0];
- break;
- default:
- sv_ops = nfs4_cb_sv_ops[1];
- }
-
- if (sv_ops == NULL)
- return ERR_PTR(-ENOTSUPP);
+ if (cb_info->serv)
+ return svc_get(cb_info->serv);
/*
* Sanity check: if there's no task,
@@ -294,7 +251,16 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
- serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
+ threadfn = nfs4_callback_svc;
+#if defined(CONFIG_NFS_V4_1)
+ if (minorversion)
+ threadfn = nfs41_callback_svc;
+#else
+ if (minorversion)
+ return ERR_PTR(-ENOTSUPP);
+#endif
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+ threadfn);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);
@@ -335,16 +301,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
goto err_start;
cb_info->users++;
- /*
- * svc_create creates the svc_serv with sv_nrthreads == 1, and then
- * svc_prepare_thread increments that. So we need to call svc_destroy
- * on both success and failure so that the refcount is 1 when the
- * thread exits.
- */
err_net:
if (!cb_info->users)
cb_info->serv = NULL;
- svc_destroy(serv);
+ svc_put(serv);
err_create:
mutex_unlock(&nfs_callback_mutex);
return ret;
@@ -369,8 +329,8 @@ void nfs_callback_down(int minorversion, struct net *net)
cb_info->users--;
if (cb_info->users == 0) {
svc_get(serv);
- serv->sv_ops->svo_setup(serv, NULL, 0);
- svc_destroy(serv);
+ svc_set_num_threads(serv, NULL, 0);
+ svc_put(serv);
dprintk("nfs_callback_down: service destroyed\n");
cb_info->serv = NULL;
}
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 6a2033131c06..ccd4f245cae2 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -170,7 +170,7 @@ struct cb_devicenotifyitem {
};
struct cb_devicenotifyargs {
- int ndevs;
+ uint32_t ndevs;
struct cb_devicenotifyitem *devs;
};
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index ed9d580826f5..c8520284dda7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -358,12 +358,11 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp,
struct cb_process_state *cps)
{
struct cb_devicenotifyargs *args = argp;
- int i;
+ const struct pnfs_layoutdriver_type *ld = NULL;
+ uint32_t i;
__be32 res = 0;
- struct nfs_client *clp = cps->clp;
- struct nfs_server *server = NULL;
- if (!clp) {
+ if (!cps->clp) {
res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
goto out;
}
@@ -371,23 +370,15 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp,
for (i = 0; i < args->ndevs; i++) {
struct cb_devicenotifyitem *dev = &args->devs[i];
- if (!server ||
- server->pnfs_curr_ld->id != dev->cbd_layout_type) {
- rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
- if (server->pnfs_curr_ld &&
- server->pnfs_curr_ld->id == dev->cbd_layout_type) {
- rcu_read_unlock();
- goto found;
- }
- rcu_read_unlock();
- continue;
+ if (!ld || ld->id != dev->cbd_layout_type) {
+ pnfs_put_layoutdriver(ld);
+ ld = pnfs_find_layoutdriver(dev->cbd_layout_type);
+ if (!ld)
+ continue;
}
-
- found:
- nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+ nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id);
}
-
+ pnfs_put_layoutdriver(ld);
out:
kfree(args->devs);
return res;
@@ -710,7 +701,7 @@ __be32 nfs4_callback_offload(void *data, void *dummy,
struct nfs4_copy_state *copy, *tmp_copy;
bool found = false;
- copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+ copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
if (!copy)
return htonl(NFS4ERR_SERVERFAULT);
@@ -739,6 +730,9 @@ out:
kfree(copy);
spin_unlock(&cps->clp->cl_lock);
+ trace_nfs4_cb_offload(&args->coa_fh, &args->coa_stateid,
+ args->wr_count, args->error,
+ args->wr_writeverf.committed);
return 0;
}
#endif /* CONFIG_NFS_V4_2 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 4c48d85f6517..8dcb08e1a885 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -67,9 +67,9 @@ static __be32 nfs4_callback_null(struct svc_rqst *rqstp)
* svc_process_common() looks for an XDR encoder to know when
* not to drop a Reply.
*/
-static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p)
+static bool nfs4_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return xdr_ressize_check(rqstp, p);
+ return true;
}
static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len,
@@ -258,11 +258,9 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
void *argp)
{
struct cb_devicenotifyargs *args = argp;
+ uint32_t tmp, n, i;
__be32 *p;
__be32 status = 0;
- u32 tmp;
- int n, i;
- args->ndevs = 0;
/* Num of device notifications */
p = xdr_inline_decode(xdr, sizeof(uint32_t));
@@ -271,12 +269,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
goto out;
}
n = ntohl(*p++);
- if (n <= 0)
- goto out;
- if (n > ULONG_MAX / sizeof(*args->devs)) {
- status = htonl(NFS4ERR_BADXDR);
+ if (n == 0)
goto out;
- }
args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
if (!args->devs) {
@@ -330,19 +324,21 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
dev->cbd_immediate = 0;
}
- args->ndevs++;
-
dprintk("%s: type %d layout 0x%x immediate %d\n",
__func__, dev->cbd_notify_type, dev->cbd_layout_type,
dev->cbd_immediate);
}
+ args->ndevs = n;
+ dprintk("%s: ndevs %d\n", __func__, args->ndevs);
+ return 0;
+err:
+ kfree(args->devs);
out:
+ args->devs = NULL;
+ args->ndevs = 0;
dprintk("%s: status %d ndevs %d\n",
__func__, ntohl(status), args->ndevs);
return status;
-err:
- kfree(args->devs);
- goto out;
}
static __be32 decode_sessionid(struct xdr_stream *xdr,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 23e165d5ec9c..e828504cc396 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -177,14 +177,13 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
INIT_LIST_HEAD(&clp->cl_superblocks);
clp->cl_rpcclient = ERR_PTR(-EINVAL);
+ clp->cl_flags = cl_init->init_flags;
clp->cl_proto = cl_init->proto;
clp->cl_nconnect = cl_init->nconnect;
clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1;
clp->cl_net = get_net(cl_init->net);
clp->cl_principal = "*";
- nfs_fscache_get_client_cookie(clp);
-
return clp;
error_cleanup:
@@ -238,8 +237,6 @@ static void pnfs_init_server(struct nfs_server *server)
*/
void nfs_free_client(struct nfs_client *clp)
{
- nfs_fscache_release_client_cookie(clp);
-
/* -EIO all pending I/O */
if (!IS_ERR(clp->cl_rpcclient))
rpc_shutdown_client(clp->cl_rpcclient);
@@ -427,7 +424,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
list_add_tail(&new->cl_share_link,
&nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
- new->cl_flags = cl_init->init_flags;
return rpc_ops->init_client(new, cl_init);
}
@@ -828,7 +824,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
/*
* Probe filesystem information, including the FSID on v2/v3
*/
-int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
+static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
{
struct nfs_fsinfo fsinfo;
struct nfs_client *clp = server->nfs_client;
@@ -860,9 +856,40 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
server->namelen = pathinfo.max_namelen;
}
+ if (clp->rpc_ops->discover_trunking != NULL &&
+ (server->caps & NFS_CAP_FS_LOCATIONS &&
+ (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) {
+ error = clp->rpc_ops->discover_trunking(server, mntfh);
+ if (error < 0)
+ return error;
+ }
+
return 0;
}
-EXPORT_SYMBOL_GPL(nfs_probe_fsinfo);
+
+/*
+ * Grab the destination's particulars, including lease expiry time.
+ *
+ * Returns zero if probe succeeded and retrieved FSID matches the FSID
+ * we have cached.
+ */
+int nfs_probe_server(struct nfs_server *server, struct nfs_fh *mntfh)
+{
+ struct nfs_fattr *fattr;
+ int error;
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ return -ENOMEM;
+
+ /* Sanity: the probe won't work if the destination server
+ * does not recognize the migrated FH. */
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
+
+ nfs_free_fattr(fattr);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_probe_server);
/*
* Copy useful information when duplicating a server record
@@ -1025,7 +1052,7 @@ struct nfs_server *nfs_create_server(struct fs_context *fc)
if (!(fattr->valid & NFS_ATTR_FATTR)) {
error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh,
- fattr, NULL, NULL);
+ fattr, NULL);
if (error < 0) {
dprintk("nfs_create_server: getattr error = %d\n", -error);
goto error;
@@ -1058,7 +1085,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
rpc_authflavor_t flavor)
{
struct nfs_server *server;
- struct nfs_fattr *fattr_fsinfo;
int error;
server = nfs_alloc_server();
@@ -1067,11 +1093,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
server->cred = get_cred(source->cred);
- error = -ENOMEM;
- fattr_fsinfo = nfs_alloc_fattr();
- if (fattr_fsinfo == NULL)
- goto out_free_server;
-
/* Copy data from the source */
server->nfs_client = source->nfs_client;
server->destroy = source->destroy;
@@ -1087,7 +1108,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
goto out_free_server;
/* probe the filesystem info for this server filesystem */
- error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
+ error = nfs_probe_server(server, fh);
if (error < 0)
goto out_free_server;
@@ -1101,11 +1122,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
nfs_server_insert_lists(server);
server->mount_time = jiffies;
- nfs_free_fattr(fattr_fsinfo);
return server;
out_free_server:
- nfs_free_fattr(fattr_fsinfo);
nfs_free_server(server);
return ERR_PTR(error);
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 11118398f495..5c97cad741a7 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -439,7 +439,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
struct nfs_delegation *freeme = NULL;
int status = 0;
- delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
+ delegation = kmalloc(sizeof(*delegation), GFP_KERNEL_ACCOUNT);
if (delegation == NULL)
return -ENOMEM;
nfs4_stateid_copy(&delegation->stateid, stateid);
@@ -755,11 +755,13 @@ int nfs4_inode_return_delegation(struct inode *inode)
struct nfs_delegation *delegation;
delegation = nfs_start_delegation_return(nfsi);
- /* Synchronous recall of any application leases */
- break_lease(inode, O_WRONLY | O_RDWR);
- nfs_wb_all(inode);
- if (delegation != NULL)
+ if (delegation != NULL) {
+ /* Synchronous recall of any application leases */
+ break_lease(inode, O_WRONLY | O_RDWR);
+ if (S_ISREG(inode->i_mode))
+ nfs_wb_all(inode);
return nfs_end_delegation_return(inode, delegation, 1);
+ }
return 0;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1a6d2867fba4..c6b263b5faf1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -18,6 +18,7 @@
* 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM
*/
+#include <linux/compat.h>
#include <linux/module.h>
#include <linux/time.h>
#include <linux/errno.h>
@@ -38,6 +39,7 @@
#include <linux/sched.h>
#include <linux/kmemleak.h>
#include <linux/xattr.h>
+#include <linux/hash.h>
#include "delegation.h"
#include "iostat.h"
@@ -68,23 +70,26 @@ const struct address_space_operations nfs_dir_aops = {
.freepage = nfs_readdir_clear_array,
};
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir)
+#define NFS_INIT_DTSIZE PAGE_SIZE
+
+static struct nfs_open_dir_context *
+alloc_nfs_open_dir_context(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
struct nfs_open_dir_context *ctx;
- ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
if (ctx != NULL) {
- ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
- ctx->dir_cookie = 0;
- ctx->dup_cookie = 0;
+ ctx->dtsize = NFS_INIT_DTSIZE;
spin_lock(&dir->i_lock);
if (list_empty(&nfsi->open_files) &&
(nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
nfs_set_cache_invalid(dir,
NFS_INO_INVALID_DATA |
NFS_INO_REVAL_FORCED);
- list_add(&ctx->list, &nfsi->open_files);
+ list_add_tail_rcu(&ctx->list, &nfsi->open_files);
+ memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf));
spin_unlock(&dir->i_lock);
return ctx;
}
@@ -94,9 +99,9 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
{
spin_lock(&dir->i_lock);
- list_del(&ctx->list);
+ list_del_rcu(&ctx->list);
spin_unlock(&dir->i_lock);
- kfree(ctx);
+ kfree_rcu(ctx, rcu_head);
}
/*
@@ -138,6 +143,7 @@ struct nfs_cache_array_entry {
};
struct nfs_cache_array {
+ u64 change_attr;
u64 last_cookie;
unsigned int size;
unsigned char page_full : 1,
@@ -151,11 +157,10 @@ struct nfs_readdir_descriptor {
struct page *page;
struct dir_context *ctx;
pgoff_t page_index;
+ pgoff_t page_index_max;
u64 dir_cookie;
u64 last_cookie;
- u64 dup_cookie;
loff_t current_index;
- loff_t prev_index;
__be32 verf[NFS_DIR_VERIFIER_SIZE];
unsigned long dir_verifier;
@@ -163,23 +168,47 @@ struct nfs_readdir_descriptor {
unsigned long gencount;
unsigned long attr_gencount;
unsigned int cache_entry_index;
- signed char duped;
+ unsigned int buffer_fills;
+ unsigned int dtsize;
+ bool clear_cache;
bool plus;
+ bool eob;
bool eof;
};
-static void nfs_readdir_array_init(struct nfs_cache_array *array)
+static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(desc->file));
+ unsigned int maxsize = server->dtsize;
+
+ if (sz > maxsize)
+ sz = maxsize;
+ if (sz < NFS_MIN_FILE_IO_SIZE)
+ sz = NFS_MIN_FILE_IO_SIZE;
+ desc->dtsize = sz;
+}
+
+static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc)
+{
+ nfs_set_dtsize(desc, desc->dtsize >> 1);
+}
+
+static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc)
{
- memset(array, 0, sizeof(struct nfs_cache_array));
+ nfs_set_dtsize(desc, desc->dtsize << 1);
}
-static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie)
+static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
+ u64 change_attr)
{
struct nfs_cache_array *array;
array = kmap_atomic(page);
- nfs_readdir_array_init(array);
+ array->change_attr = change_attr;
array->last_cookie = last_cookie;
+ array->size = 0;
+ array->page_full = 0;
+ array->page_is_eof = 0;
array->cookies_are_ordered = 1;
kunmap_atomic(array);
}
@@ -187,25 +216,31 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie)
/*
* we are freeing strings created by nfs_add_to_readdir_array()
*/
-static
-void nfs_readdir_clear_array(struct page *page)
+static void nfs_readdir_clear_array(struct page *page)
{
struct nfs_cache_array *array;
- int i;
+ unsigned int i;
array = kmap_atomic(page);
for (i = 0; i < array->size; i++)
kfree(array->array[i].name);
- nfs_readdir_array_init(array);
+ array->size = 0;
kunmap_atomic(array);
}
+static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie,
+ u64 change_attr)
+{
+ nfs_readdir_clear_array(page);
+ nfs_readdir_page_init_array(page, last_cookie, change_attr);
+}
+
static struct page *
nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags)
{
struct page *page = alloc_page(gfp_flags);
if (page)
- nfs_readdir_page_init_array(page, last_cookie);
+ nfs_readdir_page_init_array(page, last_cookie, 0);
return page;
}
@@ -217,6 +252,11 @@ static void nfs_readdir_page_array_free(struct page *page)
}
}
+static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array)
+{
+ return array->size == 0 ? array->last_cookie : array->array[0].cookie;
+}
+
static void nfs_readdir_array_set_eof(struct nfs_cache_array *array)
{
array->page_is_eof = 1;
@@ -246,36 +286,40 @@ static const char *nfs_readdir_copy_name(const char *name, unsigned int len)
return ret;
}
+static size_t nfs_readdir_array_maxentries(void)
+{
+ return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
+ sizeof(struct nfs_cache_array_entry);
+}
+
/*
* Check that the next array entry lies entirely within the page bounds
*/
static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
{
- struct nfs_cache_array_entry *cache_entry;
-
if (array->page_full)
return -ENOSPC;
- cache_entry = &array->array[array->size + 1];
- if ((char *)cache_entry - (char *)array > PAGE_SIZE) {
+ if (array->size == nfs_readdir_array_maxentries()) {
array->page_full = 1;
return -ENOSPC;
}
return 0;
}
-static
-int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+static int nfs_readdir_page_array_append(struct page *page,
+ const struct nfs_entry *entry,
+ u64 *cookie)
{
struct nfs_cache_array *array;
struct nfs_cache_array_entry *cache_entry;
const char *name;
- int ret;
+ int ret = -ENOMEM;
name = nfs_readdir_copy_name(entry->name, entry->len);
- if (!name)
- return -ENOMEM;
array = kmap_atomic(page);
+ if (!name)
+ goto out;
ret = nfs_readdir_array_can_expand(array);
if (ret) {
kfree(name);
@@ -283,7 +327,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
}
cache_entry = &array->array[array->size];
- cache_entry->cookie = entry->prev_cookie;
+ cache_entry->cookie = array->last_cookie;
cache_entry->ino = entry->ino;
cache_entry->d_type = entry->d_type;
cache_entry->name_len = entry->len;
@@ -295,23 +339,72 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
if (entry->eof != 0)
nfs_readdir_array_set_eof(array);
out:
+ *cookie = array->last_cookie;
+ kunmap_atomic(array);
+ return ret;
+}
+
+#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14)
+/*
+ * Hash algorithm allowing content addressible access to sequences
+ * of directory cookies. Content is addressed by the value of the
+ * cookie index of the first readdir entry in a page.
+ *
+ * We select only the first 18 bits to avoid issues with excessive
+ * memory use for the page cache XArray. 18 bits should allow the caching
+ * of 262144 pages of sequences of readdir entries. Since each page holds
+ * 127 readdir entries for a typical 64-bit system, that works out to a
+ * cache of ~ 33 million entries per directory.
+ */
+static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
+{
+ if (cookie == 0)
+ return 0;
+ return hash_64(cookie, 18);
+}
+
+static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
+ u64 change_attr)
+{
+ struct nfs_cache_array *array = kmap_atomic(page);
+ int ret = true;
+
+ if (array->change_attr != change_attr)
+ ret = false;
+ if (nfs_readdir_array_index_cookie(array) != last_cookie)
+ ret = false;
kunmap_atomic(array);
return ret;
}
+static void nfs_readdir_page_unlock_and_put(struct page *page)
+{
+ unlock_page(page);
+ put_page(page);
+}
+
+static void nfs_readdir_page_init_and_validate(struct page *page, u64 cookie,
+ u64 change_attr)
+{
+ if (PageUptodate(page)) {
+ if (nfs_readdir_page_validate(page, cookie, change_attr))
+ return;
+ nfs_readdir_clear_array(page);
+ }
+ nfs_readdir_page_init_array(page, cookie, change_attr);
+ SetPageUptodate(page);
+}
+
static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
- pgoff_t index, u64 last_cookie)
+ u64 cookie, u64 change_attr)
{
+ pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
struct page *page;
page = grab_cache_page(mapping, index);
- if (page && !PageUptodate(page)) {
- nfs_readdir_page_init_array(page, last_cookie);
- if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0)
- nfs_zap_mapping(mapping->host, mapping);
- SetPageUptodate(page);
- }
-
+ if (!page)
+ return NULL;
+ nfs_readdir_page_init_and_validate(page, cookie, change_attr);
return page;
}
@@ -346,24 +439,19 @@ static void nfs_readdir_page_set_eof(struct page *page)
kunmap_atomic(array);
}
-static void nfs_readdir_page_unlock_and_put(struct page *page)
-{
- unlock_page(page);
- put_page(page);
-}
-
static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
- pgoff_t index, u64 cookie)
+ u64 cookie, u64 change_attr)
{
+ pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
struct page *page;
- page = nfs_readdir_page_get_locked(mapping, index, cookie);
- if (page) {
- if (nfs_readdir_page_last_cookie(page) == cookie)
- return page;
- nfs_readdir_page_unlock_and_put(page);
- }
- return NULL;
+ page = grab_cache_page_nowait(mapping, index);
+ if (!page)
+ return NULL;
+ nfs_readdir_page_init_and_validate(page, cookie, change_attr);
+ if (nfs_readdir_page_last_cookie(page) != cookie)
+ nfs_readdir_page_reinit_array(page, cookie, change_attr);
+ return page;
}
static inline
@@ -385,6 +473,25 @@ bool nfs_readdir_use_cookie(const struct file *filp)
return true;
}
+static void nfs_readdir_seek_next_array(struct nfs_cache_array *array,
+ struct nfs_readdir_descriptor *desc)
+{
+ if (array->page_full) {
+ desc->last_cookie = array->last_cookie;
+ desc->current_index += array->size;
+ desc->cache_entry_index = 0;
+ desc->page_index++;
+ } else
+ desc->last_cookie = nfs_readdir_array_index_cookie(array);
+}
+
+static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc)
+{
+ desc->current_index = 0;
+ desc->last_cookie = 0;
+ desc->page_index = 0;
+}
+
static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
struct nfs_readdir_descriptor *desc)
{
@@ -396,6 +503,7 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
if (diff >= array->size) {
if (array->page_is_eof)
goto out_eof;
+ nfs_readdir_seek_next_array(array, desc);
return -EAGAIN;
}
@@ -408,15 +516,6 @@ out_eof:
return -EBADCOOKIE;
}
-static bool
-nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
-{
- if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
- return false;
- smp_rmb();
- return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
-}
-
static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
u64 cookie)
{
@@ -433,8 +532,7 @@ static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
struct nfs_readdir_descriptor *desc)
{
- int i;
- loff_t new_pos;
+ unsigned int i;
int status = -EAGAIN;
if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie))
@@ -442,33 +540,10 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
for (i = 0; i < array->size; i++) {
if (array->array[i].cookie == desc->dir_cookie) {
- struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
-
- new_pos = desc->current_index + i;
- if (desc->attr_gencount != nfsi->attr_gencount ||
- !nfs_readdir_inode_mapping_valid(nfsi)) {
- desc->duped = 0;
- desc->attr_gencount = nfsi->attr_gencount;
- } else if (new_pos < desc->prev_index) {
- if (desc->duped > 0
- && desc->dup_cookie == desc->dir_cookie) {
- if (printk_ratelimit()) {
- pr_notice("NFS: directory %pD2 contains a readdir loop."
- "Please contact your server vendor. "
- "The file: %s has duplicate cookie %llu\n",
- desc->file, array->array[i].name, desc->dir_cookie);
- }
- status = -ELOOP;
- goto out;
- }
- desc->dup_cookie = desc->dir_cookie;
- desc->duped = -1;
- }
if (nfs_readdir_use_cookie(desc->file))
desc->ctx->pos = desc->dir_cookie;
else
- desc->ctx->pos = new_pos;
- desc->prev_index = new_pos;
+ desc->ctx->pos = desc->current_index + i;
desc->cache_entry_index = i;
return 0;
}
@@ -478,8 +553,8 @@ check_eof:
status = -EBADCOOKIE;
if (desc->dir_cookie == array->last_cookie)
desc->eof = true;
- }
-out:
+ } else
+ nfs_readdir_seek_next_array(array, desc);
return status;
}
@@ -495,11 +570,6 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc)
else
status = nfs_readdir_search_for_cookie(array, desc);
- if (status == -EAGAIN) {
- desc->last_cookie = array->last_cookie;
- desc->current_index += array->size;
- desc->page_index++;
- }
kunmap_atomic(array);
return status;
}
@@ -535,7 +605,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
/* We requested READDIRPLUS, but the server doesn't grok it */
if (error == -ENOTSUPP && desc->plus) {
NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
- clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
desc->plus = arg.plus = false;
goto again;
}
@@ -585,52 +654,68 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
return 1;
}
-static
-bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL)
+
+static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
+ unsigned int cache_hits,
+ unsigned int cache_misses)
{
if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
return false;
- if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
- return true;
- if (ctx->pos == 0)
+ if (ctx->pos == 0 ||
+ cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD)
return true;
return false;
}
/*
- * This function is called by the lookup and getattr code to request the
+ * This function is called by the getattr code to request the
* use of readdirplus to accelerate any future lookups in the same
* directory.
*/
-void nfs_advise_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_hit(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;
if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
- !list_empty(&nfsi->open_files))
- set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+ S_ISDIR(dir->i_mode)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_hits);
+ rcu_read_unlock();
+ }
}
/*
* This function is mainly for use by nfs_getattr().
*
* If this is an 'ls -l', we want to force use of readdirplus.
- * Do this by checking if there is an active file descriptor
- * and calling nfs_advise_use_readdirplus, then forcing a
- * cache flush.
*/
-void nfs_force_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_miss(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;
if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
- !list_empty(&nfsi->open_files)) {
- set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
- invalidate_mapping_pages(dir->i_mapping,
- nfsi->page_index + 1, -1);
+ S_ISDIR(dir->i_mode)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_misses);
+ rcu_read_unlock();
}
}
+static void nfs_lookup_advise_force_readdirplus(struct inode *dir,
+ unsigned int flags)
+{
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ return;
+ if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL))
+ return;
+ nfs_readdir_record_entry_cache_miss(dir);
+}
+
static
void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
unsigned long dir_verifier)
@@ -680,9 +765,13 @@ again:
nfs_set_verifier(dentry, dir_verifier);
status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
if (!status)
- nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label);
+ nfs_setsecurity(d_inode(dentry), entry->fattr);
+ trace_nfs_readdir_lookup_revalidate(d_inode(parent),
+ dentry, 0, status);
goto out;
} else {
+ trace_nfs_readdir_lookup_revalidate_failed(
+ d_inode(parent), dentry, 0);
d_invalidate(dentry);
dput(dentry);
dentry = NULL;
@@ -694,7 +783,7 @@ again:
goto out;
}
- inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
+ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
alias = d_splice_alias(inode, dentry);
d_lookup_done(dentry);
if (alias) {
@@ -704,22 +793,38 @@ again:
dentry = alias;
}
nfs_set_verifier(dentry, dir_verifier);
+ trace_nfs_readdir_lookup(d_inode(parent), dentry, 0);
out:
dput(dentry);
}
+static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc,
+ struct nfs_entry *entry,
+ struct xdr_stream *stream)
+{
+ int ret;
+
+ if (entry->fattr->label)
+ entry->fattr->label->len = NFS4_MAXLABELLEN;
+ ret = xdr_decode(desc, entry, stream);
+ if (ret || !desc->plus)
+ return ret;
+ nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier);
+ return 0;
+}
+
/* Perform conversion from xdr to cache array */
static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
struct nfs_entry *entry,
- struct page **xdr_pages,
- unsigned int buflen,
- struct page **arrays,
- size_t narrays)
+ struct page **xdr_pages, unsigned int buflen,
+ struct page **arrays, size_t narrays,
+ u64 change_attr)
{
struct address_space *mapping = desc->file->f_mapping;
struct xdr_stream stream;
struct xdr_buf buf;
struct page *scratch, *new, *page = *arrays;
+ u64 cookie;
int status;
scratch = alloc_page(GFP_KERNEL);
@@ -730,54 +835,50 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
xdr_set_scratch_page(&stream, scratch);
do {
- if (entry->label)
- entry->label->len = NFS4_MAXLABELLEN;
-
- status = xdr_decode(desc, entry, &stream);
+ status = nfs_readdir_entry_decode(desc, entry, &stream);
if (status != 0)
break;
- if (desc->plus)
- nfs_prime_dcache(file_dentry(desc->file), entry,
- desc->dir_verifier);
-
- status = nfs_readdir_add_to_array(entry, page);
+ status = nfs_readdir_page_array_append(page, entry, &cookie);
if (status != -ENOSPC)
continue;
if (page->mapping != mapping) {
if (!--narrays)
break;
- new = nfs_readdir_page_array_alloc(entry->prev_cookie,
- GFP_KERNEL);
+ new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL);
if (!new)
break;
arrays++;
*arrays = page = new;
} else {
- new = nfs_readdir_page_get_next(mapping,
- page->index + 1,
- entry->prev_cookie);
+ new = nfs_readdir_page_get_next(mapping, cookie,
+ change_attr);
if (!new)
break;
if (page != *arrays)
nfs_readdir_page_unlock_and_put(page);
page = new;
}
- status = nfs_readdir_add_to_array(entry, page);
+ desc->page_index_max++;
+ status = nfs_readdir_page_array_append(page, entry, &cookie);
} while (!status && !entry->eof);
switch (status) {
case -EBADCOOKIE:
- if (entry->eof) {
- nfs_readdir_page_set_eof(page);
- status = 0;
- }
- break;
- case -ENOSPC:
+ if (!entry->eof)
+ break;
+ nfs_readdir_page_set_eof(page);
+ fallthrough;
case -EAGAIN:
status = 0;
break;
+ case -ENOSPC:
+ status = 0;
+ if (!desc->plus)
+ break;
+ while (!nfs_readdir_entry_decode(desc, entry, &stream))
+ ;
}
if (page != *arrays)
@@ -823,12 +924,14 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
__be32 *verf_arg, __be32 *verf_res,
struct page **arrays, size_t narrays)
{
+ u64 change_attr;
struct page **pages;
struct page *page = *arrays;
struct nfs_entry *entry;
size_t array_size;
struct inode *inode = file_inode(desc->file);
- size_t dtsize = NFS_SERVER(inode)->dtsize;
+ unsigned int dtsize = desc->dtsize;
+ unsigned int pglen;
int status = -ENOMEM;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
@@ -836,45 +939,32 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
return -ENOMEM;
entry->cookie = nfs_readdir_page_last_cookie(page);
entry->fh = nfs_alloc_fhandle();
- entry->fattr = nfs_alloc_fattr();
+ entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
entry->server = NFS_SERVER(inode);
if (entry->fh == NULL || entry->fattr == NULL)
goto out;
- entry->label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
- if (IS_ERR(entry->label)) {
- status = PTR_ERR(entry->label);
- goto out;
- }
-
array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
pages = nfs_readdir_alloc_pages(array_size);
if (!pages)
- goto out_release_label;
-
- do {
- unsigned int pglen;
- status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie,
- pages, dtsize,
- verf_res);
- if (status < 0)
- break;
-
- pglen = status;
- if (pglen == 0) {
- nfs_readdir_page_set_eof(page);
- break;
- }
+ goto out;
- verf_arg = verf_res;
+ change_attr = inode_peek_iversion_raw(inode);
+ status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages,
+ dtsize, verf_res);
+ if (status < 0)
+ goto free_pages;
+ pglen = status;
+ if (pglen != 0)
status = nfs_readdir_page_filler(desc, entry, pages, pglen,
- arrays, narrays);
- } while (!status && nfs_readdir_page_needs_filling(page));
+ arrays, narrays, change_attr);
+ else
+ nfs_readdir_page_set_eof(page);
+ desc->buffer_fills++;
+free_pages:
nfs_readdir_free_pages(pages, array_size);
-out_release_label:
- nfs4_label_free(entry->label);
out:
nfs_free_fattr(entry->fattr);
nfs_free_fhandle(entry->fh);
@@ -898,9 +988,17 @@ nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
static struct page *
nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
{
- return nfs_readdir_page_get_locked(desc->file->f_mapping,
- desc->page_index,
- desc->last_cookie);
+ struct address_space *mapping = desc->file->f_mapping;
+ u64 change_attr = inode_peek_iversion_raw(mapping->host);
+ u64 cookie = desc->last_cookie;
+ struct page *page;
+
+ page = nfs_readdir_page_get_locked(mapping, cookie, change_attr);
+ if (!page)
+ return NULL;
+ if (desc->clear_cache && !nfs_readdir_page_needs_filling(page))
+ nfs_readdir_page_reinit_array(page, cookie, change_attr);
+ return page;
}
/*
@@ -918,13 +1016,23 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
if (!desc->page)
return -ENOMEM;
if (nfs_readdir_page_needs_filling(desc->page)) {
+ /* Grow the dtsize if we had to go back for more pages */
+ if (desc->page_index == desc->page_index_max)
+ nfs_grow_dtsize(desc);
+ desc->page_index_max = desc->page_index;
+ trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf,
+ desc->last_cookie,
+ desc->page->index, desc->dtsize);
res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
&desc->page, 1);
if (res < 0) {
nfs_readdir_page_unlock_and_put_cached(desc);
+ trace_nfs_readdir_cache_fill_done(inode, res);
if (res == -EBADCOOKIE || res == -ENOTSYNC) {
invalidate_inode_pages2(desc->file->f_mapping);
- desc->page_index = 0;
+ nfs_readdir_rewind_search(desc);
+ trace_nfs_readdir_invalidate_cache_range(
+ inode, 0, MAX_LFS_FILESIZE);
return -EAGAIN;
}
return res;
@@ -932,47 +1040,30 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
/*
* Set the cookie verifier if the page cache was empty
*/
- if (desc->page_index == 0)
+ if (desc->last_cookie == 0 &&
+ memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) {
memcpy(nfsi->cookieverf, verf,
sizeof(nfsi->cookieverf));
+ invalidate_inode_pages2_range(desc->file->f_mapping, 1,
+ -1);
+ trace_nfs_readdir_invalidate_cache_range(
+ inode, 1, MAX_LFS_FILESIZE);
+ }
+ desc->clear_cache = false;
}
res = nfs_readdir_search_array(desc);
- if (res == 0) {
- nfsi->page_index = desc->page_index;
+ if (res == 0)
return 0;
- }
nfs_readdir_page_unlock_and_put_cached(desc);
return res;
}
-static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc)
-{
- struct address_space *mapping = desc->file->f_mapping;
- struct inode *dir = file_inode(desc->file);
- unsigned int dtsize = NFS_SERVER(dir)->dtsize;
- loff_t size = i_size_read(dir);
-
- /*
- * Default to uncached readdir if the page cache is empty, and
- * we're looking for a non-zero cookie in a large directory.
- */
- return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize;
-}
-
/* Search for desc->dir_cookie from the beginning of the page cache */
static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
{
int res;
- if (nfs_readdir_dont_search_cache(desc))
- return -EBADCOOKIE;
-
do {
- if (desc->page_index == 0) {
- desc->current_index = 0;
- desc->prev_index = 0;
- desc->last_cookie = 0;
- }
res = find_and_lock_cache_page(desc);
} while (res == -EAGAIN);
return res;
@@ -986,7 +1077,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
{
struct file *file = desc->file;
struct nfs_cache_array *array;
- unsigned int i = 0;
+ unsigned int i;
array = kmap(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
@@ -995,23 +1086,24 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
ent = &array->array[i];
if (!dir_emit(desc->ctx, ent->name, ent->name_len,
nfs_compat_user_ino64(ent->ino), ent->d_type)) {
- desc->eof = true;
+ desc->eob = true;
break;
}
memcpy(desc->verf, verf, sizeof(desc->verf));
- if (i < (array->size-1))
- desc->dir_cookie = array->array[i+1].cookie;
- else
+ if (i == array->size - 1) {
desc->dir_cookie = array->last_cookie;
+ nfs_readdir_seek_next_array(array, desc);
+ } else {
+ desc->dir_cookie = array->array[i + 1].cookie;
+ desc->last_cookie = array->array[0].cookie;
+ }
if (nfs_readdir_use_cookie(file))
desc->ctx->pos = desc->dir_cookie;
else
desc->ctx->pos++;
- if (desc->duped != 0)
- desc->duped = 1;
}
if (array->page_is_eof)
- desc->eof = true;
+ desc->eof = !desc->eob;
kunmap(desc->page);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n",
@@ -1048,26 +1140,63 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
goto out;
desc->page_index = 0;
+ desc->cache_entry_index = 0;
desc->last_cookie = desc->dir_cookie;
- desc->duped = 0;
+ desc->page_index_max = 0;
+
+ trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
+ -1, desc->dtsize);
status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz);
+ if (status < 0) {
+ trace_nfs_readdir_uncached_done(file_inode(desc->file), status);
+ goto out_free;
+ }
- for (i = 0; !desc->eof && i < sz && arrays[i]; i++) {
+ for (i = 0; !desc->eob && i < sz && arrays[i]; i++) {
desc->page = arrays[i];
nfs_do_filldir(desc, verf);
}
desc->page = NULL;
-
+ /*
+ * Grow the dtsize if we have to go back for more pages,
+ * or shrink it if we're reading too many.
+ */
+ if (!desc->eof) {
+ if (!desc->eob)
+ nfs_grow_dtsize(desc);
+ else if (desc->buffer_fills == 1 &&
+ i < (desc->page_index_max >> 1))
+ nfs_shrink_dtsize(desc);
+ }
+out_free:
for (i = 0; i < sz && arrays[i]; i++)
nfs_readdir_page_array_free(arrays[i]);
out:
+ if (!nfs_readdir_use_cookie(desc->file))
+ nfs_readdir_rewind_search(desc);
+ desc->page_index_max = -1;
kfree(arrays);
dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
return status;
}
+#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
+
+static bool nfs_readdir_handle_cache_misses(struct inode *inode,
+ struct nfs_readdir_descriptor *desc,
+ unsigned int cache_misses,
+ bool force_clear)
+{
+ if (desc->ctx->pos == 0 || !desc->plus)
+ return false;
+ if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear)
+ return false;
+ trace_nfs_readdir_force_readdirplus(inode);
+ return true;
+}
+
/* The file offset position represents the dirent entry number. A
last cookie cache takes care of the common case of reading the
whole directory.
@@ -1079,6 +1208,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_dir_context *dir_ctx = file->private_data;
struct nfs_readdir_descriptor *desc;
+ unsigned int cache_hits, cache_misses;
+ bool force_clear;
int res;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -1091,11 +1222,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
* to either find the entry with the appropriate number or
* revalidate the cookie.
*/
- if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
- res = nfs_revalidate_mapping(inode, file->f_mapping);
- if (res < 0)
- goto out;
- }
+ nfs_revalidate_mapping(inode, file->f_mapping);
res = -ENOMEM;
desc = kzalloc(sizeof(*desc), GFP_KERNEL);
@@ -1103,16 +1230,31 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
goto out;
desc->file = file;
desc->ctx = ctx;
- desc->plus = nfs_use_readdirplus(inode, ctx);
+ desc->page_index_max = -1;
spin_lock(&file->f_lock);
desc->dir_cookie = dir_ctx->dir_cookie;
- desc->dup_cookie = dir_ctx->dup_cookie;
- desc->duped = dir_ctx->duped;
+ desc->page_index = dir_ctx->page_index;
+ desc->last_cookie = dir_ctx->last_cookie;
desc->attr_gencount = dir_ctx->attr_gencount;
+ desc->eof = dir_ctx->eof;
+ nfs_set_dtsize(desc, dir_ctx->dtsize);
memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
+ cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0);
+ cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0);
+ force_clear = dir_ctx->force_clear;
spin_unlock(&file->f_lock);
+ if (desc->eof) {
+ res = 0;
+ goto out_free;
+ }
+
+ desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
+ force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses,
+ force_clear);
+ desc->clear_cache = force_clear;
+
do {
res = readdir_search_pagecache(desc);
@@ -1130,9 +1272,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
}
if (res == -ETOOSMALL && desc->plus) {
- clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
nfs_zap_caches(inode);
- desc->page_index = 0;
desc->plus = false;
desc->eof = false;
continue;
@@ -1142,16 +1282,21 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
nfs_do_filldir(desc, nfsi->cookieverf);
nfs_readdir_page_unlock_and_put_cached(desc);
- } while (!desc->eof);
+ if (desc->page_index == desc->page_index_max)
+ desc->clear_cache = force_clear;
+ } while (!desc->eob && !desc->eof);
spin_lock(&file->f_lock);
dir_ctx->dir_cookie = desc->dir_cookie;
- dir_ctx->dup_cookie = desc->dup_cookie;
- dir_ctx->duped = desc->duped;
+ dir_ctx->last_cookie = desc->last_cookie;
dir_ctx->attr_gencount = desc->attr_gencount;
+ dir_ctx->page_index = desc->page_index;
+ dir_ctx->force_clear = force_clear;
+ dir_ctx->eof = desc->eof;
+ dir_ctx->dtsize = desc->dtsize;
memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
spin_unlock(&file->f_lock);
-
+out_free:
kfree(desc);
out:
@@ -1186,13 +1331,15 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
}
if (offset != filp->f_pos) {
filp->f_pos = offset;
- if (nfs_readdir_use_cookie(filp))
- dir_ctx->dir_cookie = offset;
- else
+ dir_ctx->page_index = 0;
+ if (!nfs_readdir_use_cookie(filp)) {
dir_ctx->dir_cookie = 0;
- if (offset == 0)
- memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
- dir_ctx->duped = 0;
+ dir_ctx->last_cookie = 0;
+ } else {
+ dir_ctx->dir_cookie = offset;
+ dir_ctx->last_cookie = offset;
+ }
+ dir_ctx->eof = false;
}
spin_unlock(&filp->f_lock);
return offset;
@@ -1269,13 +1416,12 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry)
static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
{
struct inode *inode = d_inode(dentry);
+ struct inode *dir = d_inode(dentry->d_parent);
- if (!nfs_verifier_is_delegated(dentry) &&
- !nfs_verify_change_attribute(d_inode(dentry->d_parent), verf))
- goto out;
+ if (!nfs_verify_change_attribute(dir, verf))
+ return;
if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
nfs_set_verifier_delegated(&verf);
-out:
dentry->d_time = verf;
}
@@ -1326,6 +1472,14 @@ void nfs_clear_verifier_delegated(struct inode *inode)
EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated);
#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry)
+{
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) &&
+ d_really_is_negative(dentry))
+ return dentry->d_time == inode_peek_iversion_raw(dir);
+ return nfs_verify_change_attribute(dir, dentry->d_time);
+}
+
/*
* A check for whether or not the parent directory has changed.
* In the case it has, we assume that the dentries are untrustworthy
@@ -1339,7 +1493,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
return 1;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
return 0;
- if (!nfs_verify_change_attribute(dir, dentry->d_time))
+ if (!nfs_dentry_verify_change(dir, dentry))
return 0;
/* Revalidate nfsi->cache_change_attribute before we declare a match */
if (nfs_mapping_need_revalidate_inode(dir)) {
@@ -1348,7 +1502,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
return 0;
}
- if (!nfs_verify_change_attribute(dir, dentry->d_time))
+ if (!nfs_dentry_verify_change(dir, dentry))
return 0;
return 1;
}
@@ -1400,7 +1554,12 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
if (flags & LOOKUP_REVAL)
goto out_force;
out:
- return (inode->i_nlink == 0) ? -ESTALE : 0;
+ if (inode->i_nlink > 0 ||
+ (inode->i_nlink == 0 &&
+ test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags)))
+ return 0;
+ else
+ return -ESTALE;
out_force:
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -1413,7 +1572,7 @@ out_force:
static void nfs_mark_dir_for_revalidate(struct inode *inode)
{
spin_lock(&inode->i_lock);
- nfs_set_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE);
spin_unlock(&inode->i_lock);
}
@@ -1438,6 +1597,9 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
return 0;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
return 1;
+ /* Case insensitive server? Revalidate negative dentries */
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ return 1;
return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
}
@@ -1447,9 +1609,7 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
{
switch (error) {
case 1:
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
- __func__, dentry);
- return 1;
+ break;
case 0:
/*
* We can't d_drop the root of a disconnected tree:
@@ -1458,13 +1618,10 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
* inodes on unmount and further oopses.
*/
if (inode && IS_ROOT(dentry))
- return 1;
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
- __func__, dentry);
- return 0;
+ error = 1;
+ break;
}
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
- __func__, dentry, error);
+ trace_nfs_lookup_revalidate_exit(dir, dentry, 0, error);
return error;
}
@@ -1489,25 +1646,25 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
}
-static int
-nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
- struct inode *inode)
+static int nfs_lookup_revalidate_dentry(struct inode *dir,
+ struct dentry *dentry,
+ struct inode *inode, unsigned int flags)
{
struct nfs_fh *fhandle;
struct nfs_fattr *fattr;
- struct nfs4_label *label;
unsigned long dir_verifier;
int ret;
+ trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
+
ret = -ENOMEM;
fhandle = nfs_alloc_fhandle();
- fattr = nfs_alloc_fattr();
- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
- if (fhandle == NULL || fattr == NULL || IS_ERR(label))
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
+ if (fhandle == NULL || fattr == NULL)
goto out;
dir_verifier = nfs_save_change_attribute(dir);
- ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
+ ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
if (ret < 0) {
switch (ret) {
case -ESTALE:
@@ -1520,28 +1677,29 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
}
goto out;
}
+
+ /* Request help from readdirplus */
+ nfs_lookup_advise_force_readdirplus(dir, flags);
+
ret = 0;
if (nfs_compare_fh(NFS_FH(inode), fhandle))
goto out;
if (nfs_refresh_inode(inode, fattr) < 0)
goto out;
- nfs_setsecurity(inode, fattr, label);
+ nfs_setsecurity(inode, fattr);
nfs_set_verifier(dentry, dir_verifier);
- /* set a readdirplus hint that we had a cache miss */
- nfs_force_use_readdirplus(dir);
ret = 1;
out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
/*
* If the lookup failed despite the dentry change attribute being
* a match, then we should revalidate the directory cache.
*/
- if (!ret && nfs_verify_change_attribute(dir, dentry->d_time))
+ if (!ret && nfs_dentry_verify_change(dir, dentry))
nfs_mark_dir_for_revalidate(dir);
return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
}
@@ -1588,7 +1746,6 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
nfs_mark_dir_for_revalidate(dir);
goto out_bad;
}
- nfs_advise_use_readdirplus(dir);
goto out_valid;
}
@@ -1598,10 +1755,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
if (NFS_STALE(inode))
goto out_bad;
- trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
- error = nfs_lookup_revalidate_dentry(dir, dentry, inode);
- trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
- return error;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
out_valid:
return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
out_bad:
@@ -1721,10 +1875,6 @@ static void nfs_drop_nlink(struct inode *inode)
*/
static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
{
- if (S_ISDIR(inode->i_mode))
- /* drop any readdir cache as it could easily be old */
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
-
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
nfs_complete_unlink(dentry, inode);
nfs_drop_nlink(inode);
@@ -1759,7 +1909,6 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
struct inode *inode = NULL;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
- struct nfs4_label *label = NULL;
unsigned long dir_verifier;
int error;
@@ -1778,49 +1927,54 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
res = ERR_PTR(-ENOMEM);
fhandle = nfs_alloc_fhandle();
- fattr = nfs_alloc_fattr();
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(dir));
if (fhandle == NULL || fattr == NULL)
goto out;
- label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
- if (IS_ERR(label))
- goto out;
-
dir_verifier = nfs_save_change_attribute(dir);
trace_nfs_lookup_enter(dir, dentry, flags);
- error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
- if (error == -ENOENT)
+ error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
+ if (error == -ENOENT) {
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ dir_verifier = inode_peek_iversion_raw(dir);
goto no_entry;
+ }
if (error < 0) {
res = ERR_PTR(error);
- goto out_label;
+ goto out;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
res = ERR_CAST(inode);
if (IS_ERR(res))
- goto out_label;
+ goto out;
/* Notify readdir to use READDIRPLUS */
- nfs_force_use_readdirplus(dir);
+ nfs_lookup_advise_force_readdirplus(dir, flags);
no_entry:
res = d_splice_alias(inode, dentry);
if (res != NULL) {
if (IS_ERR(res))
- goto out_label;
+ goto out;
dentry = res;
}
nfs_set_verifier(dentry, dir_verifier);
-out_label:
- trace_nfs_lookup_exit(dir, dentry, flags, error);
- nfs4_label_free(label);
out:
+ trace_nfs_lookup_exit(dir, dentry, flags, PTR_ERR_OR_ZERO(res));
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
return res;
}
EXPORT_SYMBOL_GPL(nfs_lookup);
+void nfs_d_prune_case_insensitive_aliases(struct inode *inode)
+{
+ /* Case insensitive server? Revalidate dentries */
+ if (inode && nfs_server_capable(inode, NFS_CAP_CASE_INSENSITIVE))
+ d_prune_aliases(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases);
+
#if IS_ENABLED(CONFIG_NFS_V4)
static int nfs4_lookup_revalidate(struct dentry *, unsigned int);
@@ -1834,16 +1988,6 @@ const struct dentry_operations nfs4_dentry_operations = {
};
EXPORT_SYMBOL_GPL(nfs4_dentry_operations);
-static fmode_t flags_to_mode(int flags)
-{
- fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
- if ((flags & O_ACCMODE) != O_WRONLY)
- res |= FMODE_READ;
- if ((flags & O_ACCMODE) != O_RDONLY)
- res |= FMODE_WRITE;
- return res;
-}
-
static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp)
{
return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp);
@@ -1882,6 +2026,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
struct iattr attr = { .ia_valid = ATTR_OPEN };
struct inode *inode;
unsigned int lookup_flags = 0;
+ unsigned long dir_verifier;
bool switched = false;
int created = 0;
int err;
@@ -1955,7 +2100,11 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
switch (err) {
case -ENOENT:
d_splice_alias(NULL, dentry);
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ dir_verifier = inode_peek_iversion_raw(dir);
+ else
+ dir_verifier = nfs_save_change_attribute(dir);
+ nfs_set_verifier(dentry, dir_verifier);
break;
case -EISDIR:
case -ENOTDIR:
@@ -1983,6 +2132,24 @@ out:
no_open:
res = nfs_lookup(dir, dentry, lookup_flags);
+ if (!res) {
+ inode = d_inode(dentry);
+ if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)))
+ res = ERR_PTR(-ENOTDIR);
+ else if (inode && S_ISREG(inode->i_mode))
+ res = ERR_PTR(-EOPENSTALE);
+ } else if (!IS_ERR(res)) {
+ inode = d_inode(res);
+ if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) {
+ dput(res);
+ res = ERR_PTR(-ENOTDIR);
+ } else if (inode && S_ISREG(inode->i_mode)) {
+ dput(res);
+ res = ERR_PTR(-EOPENSTALE);
+ }
+ }
if (switched) {
d_lookup_done(dentry);
if (!res)
@@ -2035,7 +2202,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
reval_dentry:
if (flags & LOOKUP_RCU)
return -ECHILD;
- return nfs_lookup_revalidate_dentry(dir, dentry, inode);
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
full_reval:
return nfs_do_lookup_revalidate(dir, dentry, flags);
@@ -2051,8 +2218,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *
nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct dentry *parent = dget_parent(dentry);
struct inode *dir = d_inode(parent);
@@ -2063,7 +2229,7 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
d_drop(dentry);
if (fhandle->size == 0) {
- error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, NULL);
+ error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
if (error)
goto out_error;
}
@@ -2071,11 +2237,11 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
if (!(fattr->valid & NFS_ATTR_FATTR)) {
struct nfs_server *server = NFS_SB(dentry->d_sb);
error = server->nfs_client->rpc_ops->getattr(server, fhandle,
- fattr, NULL, NULL);
+ fattr, NULL);
if (error < 0)
goto out_error;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
d = d_splice_alias(inode, dentry);
out:
dput(parent);
@@ -2090,12 +2256,11 @@ EXPORT_SYMBOL_GPL(nfs_add_or_obtain);
* Code common to create, mkdir, and mknod.
*/
int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct dentry *d;
- d = nfs_add_or_obtain(dentry, fhandle, fattr, label);
+ d = nfs_add_or_obtain(dentry, fhandle, fattr);
if (IS_ERR(d))
return PTR_ERR(d);
@@ -2197,6 +2362,20 @@ static void nfs_dentry_handle_enoent(struct dentry *dentry)
d_delete(dentry);
}
+static void nfs_dentry_remove_handle_error(struct inode *dir,
+ struct dentry *dentry, int error)
+{
+ switch (error) {
+ case -ENOENT:
+ d_delete(dentry);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ break;
+ case 0:
+ nfs_d_prune_case_insensitive_aliases(d_inode(dentry));
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ }
+}
+
int nfs_rmdir(struct inode *dir, struct dentry *dentry)
{
int error;
@@ -2219,6 +2398,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
} else
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+ nfs_dentry_remove_handle_error(dir, dentry, error);
trace_nfs_rmdir_exit(dir, dentry, error);
return error;
@@ -2275,7 +2455,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
trace_nfs_unlink_enter(dir, dentry);
spin_lock(&dentry->d_lock);
- if (d_count(dentry) > 1) {
+ if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED,
+ &NFS_I(d_inode(dentry))->flags)) {
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(d_inode(dentry), 0);
@@ -2288,9 +2469,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
}
spin_unlock(&dentry->d_lock);
error = nfs_safe_remove(dentry);
- if (!error || error == -ENOENT) {
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- } else if (need_rehash)
+ nfs_dentry_remove_handle_error(dir, dentry, error);
+ if (need_rehash)
d_rehash(dentry);
out:
trace_nfs_unlink_exit(dir, dentry, error);
@@ -2352,6 +2532,8 @@ int nfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
return error;
}
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
/*
* No big deal if we can't add this page to the page cache here.
* READLINK will get the missing page from the server if needed.
@@ -2383,8 +2565,11 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
trace_nfs_link_enter(inode, dir, dentry);
d_drop(dentry);
+ if (S_ISREG(inode->i_mode))
+ nfs_sync_inode(inode);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
ihold(inode);
d_add(dentry, inode);
}
@@ -2471,6 +2656,8 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
}
}
+ if (S_ISREG(old_inode->i_mode))
+ nfs_sync_inode(old_inode);
task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
if (IS_ERR(task)) {
error = PTR_ERR(task);
@@ -2531,7 +2718,7 @@ MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache lengt
static void nfs_access_free_entry(struct nfs_access_entry *entry)
{
- put_cred(entry->cred);
+ put_group_info(entry->group_info);
kfree_rcu(entry, rcu_head);
smp_mb__before_atomic();
atomic_long_dec(&nfs_access_nr_entries);
@@ -2657,6 +2844,43 @@ void nfs_access_zap_cache(struct inode *inode)
}
EXPORT_SYMBOL_GPL(nfs_access_zap_cache);
+static int access_cmp(const struct cred *a, const struct nfs_access_entry *b)
+{
+ struct group_info *ga, *gb;
+ int g;
+
+ if (uid_lt(a->fsuid, b->fsuid))
+ return -1;
+ if (uid_gt(a->fsuid, b->fsuid))
+ return 1;
+
+ if (gid_lt(a->fsgid, b->fsgid))
+ return -1;
+ if (gid_gt(a->fsgid, b->fsgid))
+ return 1;
+
+ ga = a->group_info;
+ gb = b->group_info;
+ if (ga == gb)
+ return 0;
+ if (ga == NULL)
+ return -1;
+ if (gb == NULL)
+ return 1;
+ if (ga->ngroups < gb->ngroups)
+ return -1;
+ if (ga->ngroups > gb->ngroups)
+ return 1;
+
+ for (g = 0; g < ga->ngroups; g++) {
+ if (gid_lt(ga->gid[g], gb->gid[g]))
+ return -1;
+ if (gid_gt(ga->gid[g], gb->gid[g]))
+ return 1;
+ }
+ return 0;
+}
+
static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred)
{
struct rb_node *n = NFS_I(inode)->access_cache.rb_node;
@@ -2664,7 +2888,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co
while (n != NULL) {
struct nfs_access_entry *entry =
rb_entry(n, struct nfs_access_entry, rb_node);
- int cmp = cred_fscmp(cred, entry->cred);
+ int cmp = access_cmp(cred, entry);
if (cmp < 0)
n = n->rb_left;
@@ -2676,7 +2900,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co
return NULL;
}
-static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block)
+static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_access_entry *cache;
@@ -2706,8 +2930,7 @@ static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *
spin_lock(&inode->i_lock);
retry = false;
}
- res->cred = cache->cred;
- res->mask = cache->mask;
+ *mask = cache->mask;
list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
err = 0;
out:
@@ -2719,7 +2942,7 @@ out_zap:
return -ENOENT;
}
-static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, u32 *mask)
{
/* Only check the most recently returned cache entry,
* but do it without locking.
@@ -2735,35 +2958,36 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre
lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru));
cache = list_entry(lh, struct nfs_access_entry, lru);
if (lh == &nfsi->access_cache_entry_lru ||
- cred_fscmp(cred, cache->cred) != 0)
+ access_cmp(cred, cache) != 0)
cache = NULL;
if (cache == NULL)
goto out;
if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
goto out;
- res->cred = cache->cred;
- res->mask = cache->mask;
+ *mask = cache->mask;
err = 0;
out:
rcu_read_unlock();
return err;
}
-int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct
-nfs_access_entry *res, bool may_block)
+int nfs_access_get_cached(struct inode *inode, const struct cred *cred,
+ u32 *mask, bool may_block)
{
int status;
- status = nfs_access_get_cached_rcu(inode, cred, res);
+ status = nfs_access_get_cached_rcu(inode, cred, mask);
if (status != 0)
- status = nfs_access_get_cached_locked(inode, cred, res,
+ status = nfs_access_get_cached_locked(inode, cred, mask,
may_block);
return status;
}
EXPORT_SYMBOL_GPL(nfs_access_get_cached);
-static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
+static void nfs_access_add_rbtree(struct inode *inode,
+ struct nfs_access_entry *set,
+ const struct cred *cred)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct rb_root *root_node = &nfsi->access_cache;
@@ -2776,7 +3000,7 @@ static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *
while (*p != NULL) {
parent = *p;
entry = rb_entry(parent, struct nfs_access_entry, rb_node);
- cmp = cred_fscmp(set->cred, entry->cred);
+ cmp = access_cmp(cred, entry);
if (cmp < 0)
p = &parent->rb_left;
@@ -2798,13 +3022,16 @@ found:
nfs_access_free_entry(entry);
}
-void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set,
+ const struct cred *cred)
{
struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
if (cache == NULL)
return;
RB_CLEAR_NODE(&cache->rb_node);
- cache->cred = get_cred(set->cred);
+ cache->fsuid = cred->fsuid;
+ cache->fsgid = cred->fsgid;
+ cache->group_info = get_group_info(cred->group_info);
cache->mask = set->mask;
/* The above field assignments must be visible
@@ -2812,7 +3039,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
* use rcu_assign_pointer, so just force the memory barrier.
*/
smp_wmb();
- nfs_access_add_rbtree(inode, cache);
+ nfs_access_add_rbtree(inode, cache, cred);
/* Update accounting */
smp_mb__before_atomic();
@@ -2877,7 +3104,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
trace_nfs_access_enter(inode);
- status = nfs_access_get_cached(inode, cred, &cache, may_block);
+ status = nfs_access_get_cached(inode, cred, &cache.mask, may_block);
if (status == 0)
goto out_cached;
@@ -2888,17 +3115,13 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
/*
* Determine which access bits we want to ask for...
*/
- cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND;
- if (nfs_server_capable(inode, NFS_CAP_XATTR)) {
- cache.mask |= NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE |
- NFS_ACCESS_XALIST;
- }
+ cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND |
+ nfs_access_xattr_mask(NFS_SERVER(inode));
if (S_ISDIR(inode->i_mode))
cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
else
cache.mask |= NFS_ACCESS_EXECUTE;
- cache.cred = cred;
- status = NFS_PROTO(inode)->access(inode, &cache);
+ status = NFS_PROTO(inode)->access(inode, &cache, cred);
if (status != 0) {
if (status == -ESTALE) {
if (!S_ISDIR(inode->i_mode))
@@ -2908,7 +3131,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
}
goto out;
}
- nfs_access_add_cache(inode, &cache);
+ nfs_access_add_cache(inode, &cache, cred);
out_cached:
cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode);
if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2e894fec036b..11c566d8769f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -59,6 +59,7 @@
#include "internal.h"
#include "iostat.h"
#include "pnfs.h"
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -172,8 +173,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
if (iov_iter_rw(iter) == READ)
- return nfs_file_direct_read(iocb, iter);
- return nfs_file_direct_write(iocb, iter);
+ return nfs_file_direct_read(iocb, iter, true);
+ return nfs_file_direct_write(iocb, iter, true);
}
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -275,7 +276,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
res = (long) dreq->count;
WARN_ON_ONCE(dreq->count < 0);
}
- dreq->iocb->ki_complete(dreq->iocb, res, 0);
+ dreq->iocb->ki_complete(dreq->iocb, res);
}
complete(&dreq->completion);
@@ -424,6 +425,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* nfs_file_direct_read - file direct read operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers into which to read data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
*
* We use this function for direct reads instead of calling
* generic_file_aio_read() in order to avoid gfar's check to see if
@@ -439,7 +441,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* client must read the updated atime from the server back into its
* cache.
*/
-ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -481,12 +484,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (iter_is_iovec(iter))
dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
- nfs_start_io_direct(inode);
+ if (!swap)
+ nfs_start_io_direct(inode);
NFS_I(inode)->read_io += count;
requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
- nfs_end_io_direct(inode);
+ if (!swap)
+ nfs_end_io_direct(inode);
if (requested > 0) {
result = nfs_direct_wait(dreq);
@@ -620,7 +625,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
nfs_unlock_and_release_request(req);
}
- if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
+ if (nfs_commit_end(cinfo.mds))
nfs_direct_write_complete(dreq);
}
@@ -789,7 +794,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
*/
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
struct iov_iter *iter,
- loff_t pos)
+ loff_t pos, int ioflags)
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
@@ -797,7 +802,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
- nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
+ nfs_pageio_init_write(&desc, inode, ioflags, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
get_dreq(dreq);
@@ -875,6 +880,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* nfs_file_direct_write - file direct write operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers from which to write data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
*
* We use this function for direct writes instead of calling
* generic_file_aio_write() in order to avoid taking the inode
@@ -891,7 +897,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* Note that O_APPEND is not supported for NFS direct writes, as there
* is no atomic O_APPEND write facility in the NFS protocol.
*/
-ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
{
ssize_t result, requested;
size_t count;
@@ -905,7 +912,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos);
- result = generic_write_checks(iocb, iter);
+ if (swap)
+ /* bypass generic checks */
+ result = iov_iter_count(iter);
+ else
+ result = generic_write_checks(iocb, iter);
if (result <= 0)
return result;
count = result;
@@ -936,16 +947,22 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
dreq->iocb = iocb;
pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
- nfs_start_io_direct(inode);
+ if (swap) {
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_STABLE);
+ } else {
+ nfs_start_io_direct(inode);
- requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_COND_STABLE);
- if (mapping->nrpages) {
- invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- }
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
+ }
- nfs_end_io_direct(inode);
+ nfs_end_io_direct(inode);
+ }
if (requested > 0) {
result = nfs_direct_wait(dreq);
@@ -959,6 +976,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
} else {
result = requested;
}
+ nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE);
out_release:
nfs_direct_req_release(dreq);
out:
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index d772c20bbfd1..01596f2d0a1e 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -64,7 +64,6 @@ static struct dentry *
nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
- struct nfs4_label *label = NULL;
struct nfs_fattr *fattr = NULL;
struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw);
size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size;
@@ -79,7 +78,7 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
if (fh_len < len || fh_type != len)
return NULL;
- fattr = nfs_alloc_fattr();
+ fattr = nfs_alloc_fattr_with_label(NFS_SB(sb));
if (fattr == NULL) {
dentry = ERR_PTR(-ENOMEM);
goto out;
@@ -95,28 +94,19 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
if (inode)
goto out_found;
- label = nfs4_label_alloc(NFS_SB(sb), GFP_KERNEL);
- if (IS_ERR(label)) {
- dentry = ERR_CAST(label);
- goto out_free_fattr;
- }
-
rpc_ops = NFS_SB(sb)->nfs_client->rpc_ops;
- ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, label, NULL);
+ ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, NULL);
if (ret) {
dprintk("%s: getattr failed %d\n", __func__, ret);
trace_nfs_fh_to_dentry(sb, server_fh, fattr->fileid, ret);
dentry = ERR_PTR(ret);
- goto out_free_label;
+ goto out_free_fattr;
}
- inode = nfs_fhget(sb, server_fh, fattr, label);
+ inode = nfs_fhget(sb, server_fh, fattr);
out_found:
dentry = d_obtain_alias(inode);
-
-out_free_label:
- nfs4_label_free(label);
out_free_fattr:
nfs_free_fattr(fattr);
out:
@@ -131,7 +121,6 @@ nfs_get_parent(struct dentry *dentry)
struct super_block *sb = inode->i_sb;
struct nfs_server *server = NFS_SB(sb);
struct nfs_fattr *fattr = NULL;
- struct nfs4_label *label = NULL;
struct dentry *parent;
struct nfs_rpc_ops const *ops = server->nfs_client->rpc_ops;
struct nfs_fh fh;
@@ -139,31 +128,20 @@ nfs_get_parent(struct dentry *dentry)
if (!ops->lookupp)
return ERR_PTR(-EACCES);
- fattr = nfs_alloc_fattr();
- if (fattr == NULL) {
- parent = ERR_PTR(-ENOMEM);
- goto out;
- }
+ fattr = nfs_alloc_fattr_with_label(server);
+ if (fattr == NULL)
+ return ERR_PTR(-ENOMEM);
- label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(label)) {
- parent = ERR_CAST(label);
- goto out_free_fattr;
- }
-
- ret = ops->lookupp(inode, &fh, fattr, label);
+ ret = ops->lookupp(inode, &fh, fattr);
if (ret) {
parent = ERR_PTR(ret);
- goto out_free_label;
+ goto out;
}
- pinode = nfs_fhget(sb, &fh, fattr, label);
+ pinode = nfs_fhget(sb, &fh, fattr);
parent = d_obtain_alias(pinode);
-out_free_label:
- nfs4_label_free(label);
-out_free_fattr:
- nfs_free_fattr(fattr);
out:
+ nfs_free_fattr(fattr);
return parent;
}
@@ -180,5 +158,5 @@ const struct export_operations nfs_export_ops = {
.fetch_iversion = nfs_fetch_iversion,
.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
- EXPORT_OP_NOATOMIC_ATTR|EXPORT_OP_SYNC_LOCKS,
+ EXPORT_OP_NOATOMIC_ATTR,
};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index aa353fd58240..150b7fa8f0a7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -44,11 +44,6 @@
static const struct vm_operations_struct nfs_file_vm_ops;
-/* Hack for future NFS swap support */
-#ifndef IS_SWAPFILE
-# define IS_SWAPFILE(inode) (0)
-#endif
-
int nfs_check_flags(int flags)
{
if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
@@ -84,6 +79,7 @@ nfs_file_release(struct inode *inode, struct file *filp)
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
nfs_file_clear_open_context(filp);
+ nfs_fscache_release_file(inode, filp);
return 0;
}
EXPORT_SYMBOL_GPL(nfs_file_release);
@@ -161,7 +157,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
ssize_t result;
if (iocb->ki_flags & IOCB_DIRECT)
- return nfs_file_direct_read(iocb, to);
+ return nfs_file_direct_read(iocb, to, false);
dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
@@ -405,18 +401,17 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
* - Called if either PG_private or PG_fscache is set on the page
* - Caller holds page lock
*/
-static void nfs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+static void nfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
- page, offset, length);
+ dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
+ folio->index, offset, length);
- if (offset != 0 || length < PAGE_SIZE)
+ if (offset != 0 || length < folio_size(folio))
return;
/* Cancel any unstarted writes on this page */
- nfs_wb_page_cancel(page_file_mapping(page)->host, page);
-
- nfs_fscache_invalidate_page(page, page->mapping->host);
+ nfs_wb_folio_cancel(folio->mapping->host, folio);
+ folio_wait_fscache(folio);
}
/*
@@ -472,16 +467,15 @@ static void nfs_check_dirty_writeback(struct page *page,
* - Caller holds page lock
* - Return 0 if successful, -error otherwise
*/
-static int nfs_launder_page(struct page *page)
+static int nfs_launder_folio(struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
- struct nfs_inode *nfsi = NFS_I(inode);
+ struct inode *inode = folio->mapping->host;
- dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
- inode->i_ino, (long long)page_offset(page));
+ dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n",
+ inode->i_ino, folio_pos(folio));
- nfs_fscache_wait_on_page_write(nfsi, page);
- return nfs_wb_page(inode, page);
+ folio_wait_fscache(folio);
+ return nfs_wb_page(inode, &folio->page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -489,8 +483,9 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
{
unsigned long blocks;
long long isize;
- struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = file_inode(file);
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
spin_lock(&inode->i_lock);
blocks = inode->i_blocks;
@@ -503,31 +498,39 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
*span = sis->pages;
+
+ if (cl->rpc_ops->enable_swap)
+ cl->rpc_ops->enable_swap(inode);
+
return rpc_clnt_swap_activate(clnt);
}
static void nfs_swap_deactivate(struct file *file)
{
- struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+ struct inode *inode = file_inode(file);
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
rpc_clnt_swap_deactivate(clnt);
+ if (cl->rpc_ops->disable_swap)
+ cl->rpc_ops->disable_swap(file_inode(file));
}
const struct address_space_operations nfs_file_aops = {
.readpage = nfs_readpage,
- .readpages = nfs_readpages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .readahead = nfs_readahead,
+ .dirty_folio = filemap_dirty_folio,
.writepage = nfs_writepage,
.writepages = nfs_writepages,
.write_begin = nfs_write_begin,
.write_end = nfs_write_end,
- .invalidatepage = nfs_invalidate_page,
+ .invalidate_folio = nfs_invalidate_folio,
.releasepage = nfs_release_page,
.direct_IO = nfs_direct_IO,
#ifdef CONFIG_MIGRATION
.migratepage = nfs_migrate_page,
#endif
- .launder_page = nfs_launder_page,
+ .launder_folio = nfs_launder_folio,
.is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
.swap_activate = nfs_swap_activate,
@@ -555,7 +558,11 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
/* make sure the cache has finished storing the page */
- nfs_fscache_wait_on_page_write(NFS_I(inode), page);
+ if (PageFsCache(page) &&
+ wait_on_page_fscache_killable(vmf->page) < 0) {
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
nfs_wait_bit_killable, TASK_KILLABLE);
@@ -616,7 +623,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
return result;
if (iocb->ki_flags & IOCB_DIRECT)
- return nfs_file_direct_write(iocb, from);
+ return nfs_file_direct_write(iocb, from, false);
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
file, iov_iter_count(from), (long long) iocb->ki_pos);
@@ -639,7 +646,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
result = generic_write_checks(iocb, from);
if (result > 0) {
current->backing_dev_info = inode_to_bdi(inode);
- result = generic_perform_write(file, from, iocb->ki_pos);
+ result = generic_perform_write(iocb, from);
current->backing_dev_info = NULL;
}
nfs_end_io_write(inode);
@@ -843,15 +850,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /*
- * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
- * any standard. In principle we might be able to support LOCK_MAND
- * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
- * NFS code is not set up for it.
- */
- if (fl->fl_type & LOCK_MAND)
- return -EINVAL;
-
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d2103852475f..76deddab0a8f 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -293,8 +293,6 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs41_sequence_done(task, &hdr->res.seq_res);
@@ -1077,7 +1075,7 @@ filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ?
fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
- new = pnfs_alloc_commit_array(size, GFP_NOIO);
+ new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask());
if (new) {
spin_lock(&inode->i_lock);
array = pnfs_add_commit_array(fl_cinfo, new, lseg);
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index 79323b5dab0c..aed0748fd6ec 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr {
u32 stripe_count;
u8 *stripe_indices;
u32 ds_num;
- struct nfs4_pnfs_ds *ds_list[1];
+ struct nfs4_pnfs_ds *ds_list[];
};
struct nfs4_filelayout_segment {
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 86c3f7e69ec4..acf4b88889dc 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -136,9 +136,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
goto out_err_free_stripe_indices;
}
- dsaddr = kzalloc(sizeof(*dsaddr) +
- (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
- gfp_flags);
+ dsaddr = kzalloc(struct_size(dsaddr, ds_list, num), gfp_flags);
if (!dsaddr)
goto out_err_free_stripe_indices;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index d383de00d486..604be402ae13 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -663,7 +663,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode,
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(inode, GFP_KERNEL);
+ pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
}
static void
@@ -694,7 +694,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode,
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(inode, GFP_NOIO);
+ pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
}
static void
@@ -806,13 +806,10 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
bool strict_iomode)
{
pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- req->wb_bytes,
- IOMODE_READ,
- strict_iomode,
- GFP_KERNEL);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), req->wb_bytes, IOMODE_READ,
+ strict_iomode, nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -894,13 +891,10 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
retry:
ff_layout_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- req->wb_bytes,
- IOMODE_RW,
- false,
- GFP_NOFS);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), req->wb_bytes,
+ IOMODE_RW, false, nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -953,13 +947,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
if (!pgio->pg_lseg) {
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- req->wb_bytes,
- IOMODE_RW,
- false,
- GFP_NOFS);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), req->wb_bytes,
+ IOMODE_RW, false, nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -1258,7 +1249,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
- GFP_NOIO);
+ nfs_io_gfp_mask());
switch (status) {
case NFS4ERR_DELAY:
@@ -1414,8 +1405,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1975,7 +1964,8 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
struct inode *inode = lseg->pls_layout->plh_inode;
struct pnfs_commit_array *array, *new;
- new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO);
+ new = pnfs_alloc_commit_array(flseg->mirror_array_cnt,
+ nfs_io_gfp_mask());
if (new) {
spin_lock(&inode->i_lock);
array = pnfs_add_commit_array(fl_cinfo, new, lseg);
@@ -2154,10 +2144,10 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
struct nfs4_flexfile_layoutreturn_args *ff_args;
struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);
- ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL);
+ ff_args = kmalloc(sizeof(*ff_args), nfs_io_gfp_mask());
if (!ff_args)
goto out_nomem;
- ff_args->pages[0] = alloc_page(GFP_KERNEL);
+ ff_args->pages[0] = alloc_page(nfs_io_gfp_mask());
if (!ff_args->pages[0])
goto out_nomem_free;
@@ -2194,8 +2184,8 @@ ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
if (list_empty(&head))
return;
- errors = kmalloc_array(NFS42_LAYOUTERROR_MAX,
- sizeof(*errors), GFP_NOFS);
+ errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors),
+ nfs_io_gfp_mask());
if (errors != NULL) {
const struct nfs4_ff_layout_ds_err *pos;
size_t n = 0;
@@ -2446,7 +2436,8 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
- args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
+ args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo),
+ nfs_io_gfp_mask());
if (!args->devinfo)
return -ENOMEM;
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index c9b61b818ec1..bfa7202ca7be 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -378,10 +378,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
goto noconnect;
ds = mirror->mirror_ds->ds;
+ if (READ_ONCE(ds->ds_clp))
+ goto out;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
smp_rmb();
- if (ds->ds_clp)
- goto out;
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 0d444a90f513..9a16897e8dc6 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -10,6 +10,7 @@
* Split from fs/nfs/super.c by David Howells <dhowells@redhat.com>
*/
+#include <linux/compat.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
@@ -79,6 +80,7 @@ enum nfs_param {
Opt_source,
Opt_tcp,
Opt_timeo,
+ Opt_trunkdiscovery,
Opt_udp,
Opt_v,
Opt_vers,
@@ -179,6 +181,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_string("source", Opt_source),
fsparam_flag ("tcp", Opt_tcp),
fsparam_u32 ("timeo", Opt_timeo),
+ fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery),
fsparam_flag ("udp", Opt_udp),
fsparam_flag ("v2", Opt_v),
fsparam_flag ("v3", Opt_v),
@@ -514,7 +517,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
if (result.negated)
ctx->flags &= ~NFS_MOUNT_SOFTREVAL;
else
- ctx->flags &= NFS_MOUNT_SOFTREVAL;
+ ctx->flags |= NFS_MOUNT_SOFTREVAL;
break;
case Opt_posix:
if (result.negated)
@@ -528,6 +531,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
else
ctx->flags &= ~NFS_MOUNT_NOCTO;
break;
+ case Opt_trunkdiscovery:
+ if (result.negated)
+ ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY;
+ else
+ ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY;
+ break;
case Opt_ac:
if (result.negated)
ctx->flags |= NFS_MOUNT_NOAC;
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
deleted file mode 100644
index 573b1da9342c..000000000000
--- a/fs/nfs/fscache-index.c
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* NFS FS-Cache index structure definition
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/nfs_fs.h>
-#include <linux/nfs_fs_sb.h>
-#include <linux/in6.h>
-#include <linux/iversion.h>
-
-#include "internal.h"
-#include "fscache.h"
-
-#define NFSDBG_FACILITY NFSDBG_FSCACHE
-
-/*
- * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
- * the cookie for the top-level index object for NFS into here. The top-level
- * index can than have other cache objects inserted into it.
- */
-struct fscache_netfs nfs_fscache_netfs = {
- .name = "nfs",
- .version = 0,
-};
-
-/*
- * Register NFS for caching
- */
-int nfs_fscache_register(void)
-{
- return fscache_register_netfs(&nfs_fscache_netfs);
-}
-
-/*
- * Unregister NFS for caching
- */
-void nfs_fscache_unregister(void)
-{
- fscache_unregister_netfs(&nfs_fscache_netfs);
-}
-
-/*
- * Define the server object for FS-Cache. This is used to describe a server
- * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
- * server address parameters.
- */
-const struct fscache_cookie_def nfs_fscache_server_index_def = {
- .name = "NFS.server",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-/*
- * Define the superblock object for FS-Cache. This is used to describe a
- * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
- * parameters that might cause a separate superblock.
- */
-const struct fscache_cookie_def nfs_fscache_super_index_def = {
- .name = "NFS.super",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-/*
- * Consult the netfs about the state of an object
- * - This function can be absent if the index carries no state data
- * - The netfs data from the cookie being used as the target is
- * presented, as is the auxiliary data
- */
-static
-enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
- const void *data,
- uint16_t datalen,
- loff_t object_size)
-{
- struct nfs_fscache_inode_auxdata auxdata;
- struct nfs_inode *nfsi = cookie_netfs_data;
-
- if (datalen != sizeof(auxdata))
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
- auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
- auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
- auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
-
- if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
-
- if (memcmp(data, &auxdata, datalen) != 0)
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- return FSCACHE_CHECKAUX_OKAY;
-}
-
-/*
- * Get an extra reference on a read context.
- * - This function can be absent if the completion function doesn't require a
- * context.
- * - The read context is passed back to NFS in the event that a data read on the
- * cache fails with EIO - in which case the server must be contacted to
- * retrieve the data, which requires the read context for security.
- */
-static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
-{
- get_nfs_open_context(context);
-}
-
-/*
- * Release an extra reference on a read context.
- * - This function can be absent if the completion function doesn't require a
- * context.
- */
-static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
-{
- if (context)
- put_nfs_open_context(context);
-}
-
-/*
- * Define the inode object for FS-Cache. This is used to describe an inode
- * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
- * an inode.
- *
- * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
- * held in the cache auxiliary data for the data storage object with those in
- * the inode struct in memory.
- */
-const struct fscache_cookie_def nfs_fscache_inode_object_def = {
- .name = "NFS.fh",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .check_aux = nfs_fscache_inode_check_aux,
- .get_context = nfs_fh_get_context,
- .put_context = nfs_fh_put_context,
-};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index d743629e05e1..f73c09a9cf0a 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -19,27 +19,20 @@
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
+#include "nfstrace.h"
-#define NFSDBG_FACILITY NFSDBG_FSCACHE
+#define NFS_MAX_KEY_LEN 1000
-static struct rb_root nfs_fscache_keys = RB_ROOT;
-static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
-
-/*
- * Layout of the key for an NFS server cache object.
- */
-struct nfs_server_key {
- struct {
- uint16_t nfsversion; /* NFS protocol version */
- uint32_t minorversion; /* NFSv4 minor version */
- uint16_t family; /* address family */
- __be16 port; /* IP port */
- } hdr;
- union {
- struct in_addr ipv4_addr; /* IPv4 address */
- struct in6_addr ipv6_addr; /* IPv6 address */
- };
-} __packed;
+static bool nfs_append_int(char *key, int *_len, unsigned long long x)
+{
+ if (*_len > NFS_MAX_KEY_LEN)
+ return false;
+ if (x == 0)
+ key[(*_len)++] = ',';
+ else
+ *_len += sprintf(key + *_len, ",%llx", x);
+ return true;
+}
/*
* Get the per-client index cookie for an NFS client if the appropriate mount
@@ -47,160 +40,106 @@ struct nfs_server_key {
* - We always try and get an index cookie for the client, but get filehandle
* cookies on a per-superblock basis, depending on the mount flags
*/
-void nfs_fscache_get_client_cookie(struct nfs_client *clp)
+static bool nfs_fscache_get_client_key(struct nfs_client *clp,
+ char *key, int *_len)
{
const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
- struct nfs_server_key key;
- uint16_t len = sizeof(key.hdr);
- memset(&key, 0, sizeof(key));
- key.hdr.nfsversion = clp->rpc_ops->version;
- key.hdr.minorversion = clp->cl_minorversion;
- key.hdr.family = clp->cl_addr.ss_family;
+ *_len += snprintf(key + *_len, NFS_MAX_KEY_LEN - *_len,
+ ",%u.%u,%x",
+ clp->rpc_ops->version,
+ clp->cl_minorversion,
+ clp->cl_addr.ss_family);
switch (clp->cl_addr.ss_family) {
case AF_INET:
- key.hdr.port = sin->sin_port;
- key.ipv4_addr = sin->sin_addr;
- len += sizeof(key.ipv4_addr);
- break;
+ if (!nfs_append_int(key, _len, sin->sin_port) ||
+ !nfs_append_int(key, _len, sin->sin_addr.s_addr))
+ return false;
+ return true;
case AF_INET6:
- key.hdr.port = sin6->sin6_port;
- key.ipv6_addr = sin6->sin6_addr;
- len += sizeof(key.ipv6_addr);
- break;
+ if (!nfs_append_int(key, _len, sin6->sin6_port) ||
+ !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[0]) ||
+ !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[1]) ||
+ !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[2]) ||
+ !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[3]))
+ return false;
+ return true;
default:
printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
clp->cl_addr.ss_family);
- clp->fscache = NULL;
- return;
+ return false;
}
-
- /* create a cache index for looking up filehandles */
- clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
- &nfs_fscache_server_index_def,
- &key, len,
- NULL, 0,
- clp, 0, true);
- dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
- clp, clp->fscache);
}
/*
- * Dispose of a per-client cookie
- */
-void nfs_fscache_release_client_cookie(struct nfs_client *clp)
-{
- dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
- clp, clp->fscache);
-
- fscache_relinquish_cookie(clp->fscache, NULL, false);
- clp->fscache = NULL;
-}
-
-/*
- * Get the cache cookie for an NFS superblock. We have to handle
- * uniquification here because the cache doesn't do it for us.
+ * Get the cache cookie for an NFS superblock.
*
* The default uniquifier is just an empty string, but it may be overridden
* either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
* superblock across an automount point of some nature.
*/
-void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen)
+int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen)
{
- struct nfs_fscache_key *key, *xkey;
+ struct fscache_volume *vcookie;
struct nfs_server *nfss = NFS_SB(sb);
- struct rb_node **p, *parent;
- int diff;
+ unsigned int len = 3;
+ char *key;
- nfss->fscache_key = NULL;
- nfss->fscache = NULL;
- if (!uniq) {
- uniq = "";
- ulen = 1;
+ if (uniq) {
+ nfss->fscache_uniq = kmemdup_nul(uniq, ulen, GFP_KERNEL);
+ if (!nfss->fscache_uniq)
+ return -ENOMEM;
}
- key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
+ key = kmalloc(NFS_MAX_KEY_LEN + 24, GFP_KERNEL);
if (!key)
- return;
-
- key->nfs_client = nfss->nfs_client;
- key->key.super.s_flags = sb->s_flags & NFS_SB_MASK;
- key->key.nfs_server.flags = nfss->flags;
- key->key.nfs_server.rsize = nfss->rsize;
- key->key.nfs_server.wsize = nfss->wsize;
- key->key.nfs_server.acregmin = nfss->acregmin;
- key->key.nfs_server.acregmax = nfss->acregmax;
- key->key.nfs_server.acdirmin = nfss->acdirmin;
- key->key.nfs_server.acdirmax = nfss->acdirmax;
- key->key.nfs_server.fsid = nfss->fsid;
- key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
-
- key->key.uniq_len = ulen;
- memcpy(key->key.uniquifier, uniq, ulen);
-
- spin_lock(&nfs_fscache_keys_lock);
- p = &nfs_fscache_keys.rb_node;
- parent = NULL;
- while (*p) {
- parent = *p;
- xkey = rb_entry(parent, struct nfs_fscache_key, node);
-
- if (key->nfs_client < xkey->nfs_client)
- goto go_left;
- if (key->nfs_client > xkey->nfs_client)
- goto go_right;
-
- diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
- if (diff < 0)
- goto go_left;
- if (diff > 0)
- goto go_right;
-
- if (key->key.uniq_len == 0)
- goto non_unique;
- diff = memcmp(key->key.uniquifier,
- xkey->key.uniquifier,
- key->key.uniq_len);
- if (diff < 0)
- goto go_left;
- if (diff > 0)
- goto go_right;
- goto non_unique;
-
- go_left:
- p = &(*p)->rb_left;
- continue;
- go_right:
- p = &(*p)->rb_right;
+ return -ENOMEM;
+
+ memcpy(key, "nfs", 3);
+ if (!nfs_fscache_get_client_key(nfss->nfs_client, key, &len) ||
+ !nfs_append_int(key, &len, nfss->fsid.major) ||
+ !nfs_append_int(key, &len, nfss->fsid.minor) ||
+ !nfs_append_int(key, &len, sb->s_flags & NFS_SB_MASK) ||
+ !nfs_append_int(key, &len, nfss->flags) ||
+ !nfs_append_int(key, &len, nfss->rsize) ||
+ !nfs_append_int(key, &len, nfss->wsize) ||
+ !nfs_append_int(key, &len, nfss->acregmin) ||
+ !nfs_append_int(key, &len, nfss->acregmax) ||
+ !nfs_append_int(key, &len, nfss->acdirmin) ||
+ !nfs_append_int(key, &len, nfss->acdirmax) ||
+ !nfs_append_int(key, &len, nfss->client->cl_auth->au_flavor))
+ goto out;
+
+ if (ulen > 0) {
+ if (ulen > NFS_MAX_KEY_LEN - len)
+ goto out;
+ key[len++] = ',';
+ memcpy(key + len, uniq, ulen);
+ len += ulen;
}
-
- rb_link_node(&key->node, parent, p);
- rb_insert_color(&key->node, &nfs_fscache_keys);
- spin_unlock(&nfs_fscache_keys_lock);
- nfss->fscache_key = key;
+ key[len] = 0;
/* create a cache index for looking up filehandles */
- nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
- &nfs_fscache_super_index_def,
- &key->key,
- sizeof(key->key) + ulen,
- NULL, 0,
- nfss, 0, true);
- dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
- nfss, nfss->fscache);
- return;
-
-non_unique:
- spin_unlock(&nfs_fscache_keys_lock);
+ vcookie = fscache_acquire_volume(key,
+ NULL, /* preferred_cache */
+ NULL, 0 /* coherency_data */);
+ if (IS_ERR(vcookie)) {
+ if (vcookie != ERR_PTR(-EBUSY)) {
+ kfree(key);
+ return PTR_ERR(vcookie);
+ }
+ pr_err("NFS: Cache volume key already in use (%s)\n", key);
+ vcookie = NULL;
+ }
+ nfss->fscache = vcookie;
+
+out:
kfree(key);
- nfss->fscache_key = NULL;
- nfss->fscache = NULL;
- printk(KERN_WARNING "NFS:"
- " Cache request denied due to non-unique superblock keys\n");
+ return 0;
}
/*
@@ -210,32 +149,9 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
{
struct nfs_server *nfss = NFS_SB(sb);
- dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
- nfss, nfss->fscache);
-
- fscache_relinquish_cookie(nfss->fscache, NULL, false);
+ fscache_relinquish_volume(nfss->fscache, NULL, false);
nfss->fscache = NULL;
-
- if (nfss->fscache_key) {
- spin_lock(&nfs_fscache_keys_lock);
- rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
- spin_unlock(&nfs_fscache_keys_lock);
- kfree(nfss->fscache_key);
- nfss->fscache_key = NULL;
- }
-}
-
-static void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
- struct nfs_inode *nfsi)
-{
- memset(auxdata, 0, sizeof(*auxdata));
- auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
- auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
- auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
- auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
-
- if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
+ kfree(nfss->fscache_uniq);
}
/*
@@ -251,13 +167,15 @@ void nfs_fscache_init_inode(struct inode *inode)
if (!(nfss->fscache && S_ISREG(inode->i_mode)))
return;
- nfs_fscache_update_auxdata(&auxdata, nfsi);
+ nfs_fscache_update_auxdata(&auxdata, inode);
nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
- &nfs_fscache_inode_object_def,
- nfsi->fh.data, nfsi->fh.size,
- &auxdata, sizeof(auxdata),
- nfsi, nfsi->vfs_inode.i_size, false);
+ 0,
+ nfsi->fh.data, /* index_key */
+ nfsi->fh.size,
+ &auxdata, /* aux_data */
+ sizeof(auxdata),
+ i_size_read(inode));
}
/*
@@ -265,24 +183,13 @@ void nfs_fscache_init_inode(struct inode *inode)
*/
void nfs_fscache_clear_inode(struct inode *inode)
{
- struct nfs_fscache_inode_auxdata auxdata;
struct nfs_inode *nfsi = NFS_I(inode);
struct fscache_cookie *cookie = nfs_i_fscache(inode);
- dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
-
- nfs_fscache_update_auxdata(&auxdata, nfsi);
- fscache_relinquish_cookie(cookie, &auxdata, false);
+ fscache_relinquish_cookie(cookie, false);
nfsi->fscache = NULL;
}
-static bool nfs_fscache_can_enable(void *data)
-{
- struct inode *inode = data;
-
- return !inode_is_open_for_write(inode);
-}
-
/*
* Enable or disable caching for a file that is being opened as appropriate.
* The cookie is allocated when the inode is initialised, but is not enabled at
@@ -305,216 +212,138 @@ static bool nfs_fscache_can_enable(void *data)
void nfs_fscache_open_file(struct inode *inode, struct file *filp)
{
struct nfs_fscache_inode_auxdata auxdata;
- struct nfs_inode *nfsi = NFS_I(inode);
struct fscache_cookie *cookie = nfs_i_fscache(inode);
+ bool open_for_write = inode_is_open_for_write(inode);
if (!fscache_cookie_valid(cookie))
return;
- nfs_fscache_update_auxdata(&auxdata, nfsi);
-
- if (inode_is_open_for_write(inode)) {
- dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
- clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
- fscache_disable_cookie(cookie, &auxdata, true);
- fscache_uncache_all_inode_pages(cookie, inode);
- } else {
- dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi);
- fscache_enable_cookie(cookie, &auxdata, nfsi->vfs_inode.i_size,
- nfs_fscache_can_enable, inode);
- if (fscache_cookie_enabled(cookie))
- set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+ fscache_use_cookie(cookie, open_for_write);
+ if (open_for_write) {
+ nfs_fscache_update_auxdata(&auxdata, inode);
+ fscache_invalidate(cookie, &auxdata, i_size_read(inode),
+ FSCACHE_INVAL_DIO_WRITE);
}
}
EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
-/*
- * Release the caching state associated with a page, if the page isn't busy
- * interacting with the cache.
- * - Returns true (can release page) or false (page busy).
- */
-int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+void nfs_fscache_release_file(struct inode *inode, struct file *filp)
{
- if (PageFsCache(page)) {
- struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host);
-
- BUG_ON(!cookie);
- dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
- cookie, page, NFS_I(page->mapping->host));
-
- if (!fscache_maybe_release_page(cookie, page, gfp))
- return 0;
+ struct nfs_fscache_inode_auxdata auxdata;
+ struct fscache_cookie *cookie = nfs_i_fscache(inode);
- nfs_inc_fscache_stats(page->mapping->host,
- NFSIOS_FSCACHE_PAGES_UNCACHED);
+ if (fscache_cookie_valid(cookie)) {
+ nfs_fscache_update_auxdata(&auxdata, inode);
+ fscache_unuse_cookie(cookie, &auxdata, NULL);
}
-
- return 1;
}
/*
- * Release the caching state associated with a page if undergoing complete page
- * invalidation.
+ * Fallback page reading interface.
*/
-void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
+static int fscache_fallback_read_page(struct inode *inode, struct page *page)
{
+ struct netfs_cache_resources cres;
struct fscache_cookie *cookie = nfs_i_fscache(inode);
+ struct iov_iter iter;
+ struct bio_vec bvec[1];
+ int ret;
- BUG_ON(!cookie);
-
- dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
- cookie, page, NFS_I(inode));
-
- fscache_wait_on_page_write(cookie, page);
+ memset(&cres, 0, sizeof(cres));
+ bvec[0].bv_page = page;
+ bvec[0].bv_offset = 0;
+ bvec[0].bv_len = PAGE_SIZE;
+ iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
- BUG_ON(!PageLocked(page));
- fscache_uncache_page(cookie, page);
- nfs_inc_fscache_stats(page->mapping->host,
- NFSIOS_FSCACHE_PAGES_UNCACHED);
-}
+ ret = fscache_begin_read_operation(&cres, cookie);
+ if (ret < 0)
+ return ret;
-/*
- * Handle completion of a page being read from the cache.
- * - Called in process (keventd) context.
- */
-static void nfs_readpage_from_fscache_complete(struct page *page,
- void *context,
- int error)
-{
- dfprintk(FSCACHE,
- "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
- page, context, error);
-
- /*
- * If the read completes with an error, mark the page with PG_checked,
- * unlock the page, and let the VM reissue the readpage.
- */
- if (!error)
- SetPageUptodate(page);
- else
- SetPageChecked(page);
- unlock_page(page);
+ ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL,
+ NULL, NULL);
+ fscache_end_operation(&cres);
+ return ret;
}
/*
- * Retrieve a page from fscache
+ * Fallback page writing interface.
*/
-int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
- struct inode *inode, struct page *page)
+static int fscache_fallback_write_page(struct inode *inode, struct page *page,
+ bool no_space_allocated_yet)
{
+ struct netfs_cache_resources cres;
+ struct fscache_cookie *cookie = nfs_i_fscache(inode);
+ struct iov_iter iter;
+ struct bio_vec bvec[1];
+ loff_t start = page_offset(page);
+ size_t len = PAGE_SIZE;
int ret;
- dfprintk(FSCACHE,
- "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
- nfs_i_fscache(inode), page, page->index, page->flags, inode);
+ memset(&cres, 0, sizeof(cres));
+ bvec[0].bv_page = page;
+ bvec[0].bv_offset = 0;
+ bvec[0].bv_len = PAGE_SIZE;
+ iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
- if (PageChecked(page)) {
- ClearPageChecked(page);
- return 1;
- }
-
- ret = fscache_read_or_alloc_page(nfs_i_fscache(inode),
- page,
- nfs_readpage_from_fscache_complete,
- ctx,
- GFP_KERNEL);
-
- switch (ret) {
- case 0: /* read BIO submitted (page in fscache) */
- dfprintk(FSCACHE,
- "NFS: readpage_from_fscache: BIO submitted\n");
- nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
+ ret = fscache_begin_write_operation(&cres, cookie);
+ if (ret < 0)
return ret;
- case -ENOBUFS: /* inode not in cache */
- case -ENODATA: /* page not in cache */
- nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
- dfprintk(FSCACHE,
- "NFS: readpage_from_fscache %d\n", ret);
- return 1;
-
- default:
- dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
- nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
- }
+ ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
+ no_space_allocated_yet);
+ if (ret == 0)
+ ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL);
+ fscache_end_operation(&cres);
return ret;
}
/*
- * Retrieve a set of pages from fscache
+ * Retrieve a page from fscache
*/
-int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
- struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
+int __nfs_fscache_read_page(struct inode *inode, struct page *page)
{
- unsigned npages = *nr_pages;
int ret;
- dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
- nfs_i_fscache(inode), npages, inode);
-
- ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode),
- mapping, pages, nr_pages,
- nfs_readpage_from_fscache_complete,
- ctx,
- mapping_gfp_mask(mapping));
- if (*nr_pages < npages)
- nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
- npages);
- if (*nr_pages > 0)
- nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
- *nr_pages);
-
- switch (ret) {
- case 0: /* read submitted to the cache for all pages */
- BUG_ON(!list_empty(pages));
- BUG_ON(*nr_pages != 0);
- dfprintk(FSCACHE,
- "NFS: nfs_getpages_from_fscache: submitted\n");
-
- return ret;
-
- case -ENOBUFS: /* some pages aren't cached and can't be */
- case -ENODATA: /* some pages aren't cached */
- dfprintk(FSCACHE,
- "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
- return 1;
+ trace_nfs_fscache_read_page(inode, page);
+ if (PageChecked(page)) {
+ ClearPageChecked(page);
+ ret = 1;
+ goto out;
+ }
- default:
- dfprintk(FSCACHE,
- "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
+ ret = fscache_fallback_read_page(inode, page);
+ if (ret < 0) {
+ nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
+ SetPageChecked(page);
+ goto out;
}
+ /* Read completed synchronously */
+ nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
+ SetPageUptodate(page);
+ ret = 0;
+out:
+ trace_nfs_fscache_read_page_exit(inode, page, ret);
return ret;
}
/*
- * Store a newly fetched page in fscache
- * - PG_fscache must be set on the page
+ * Store a newly fetched page in fscache. We can be certain there's no page
+ * stored in the cache as yet otherwise we would've read it from there.
*/
-void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
+void __nfs_fscache_write_page(struct inode *inode, struct page *page)
{
int ret;
- dfprintk(FSCACHE,
- "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
- nfs_i_fscache(inode), page, page->index, page->flags, sync);
+ trace_nfs_fscache_write_page(inode, page);
- ret = fscache_write_page(nfs_i_fscache(inode), page,
- inode->i_size, GFP_KERNEL);
- dfprintk(FSCACHE,
- "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
- page, page->index, page->flags, ret);
+ ret = fscache_fallback_write_page(inode, page, true);
if (ret != 0) {
- fscache_uncache_page(nfs_i_fscache(inode), page);
- nfs_inc_fscache_stats(inode,
- NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
+ nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
} else {
- nfs_inc_fscache_stats(inode,
- NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
+ nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
}
+ trace_nfs_fscache_write_page_exit(inode, page, ret);
}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 6754c8607230..4e980cc04779 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -8,51 +8,16 @@
#ifndef _NFS_FSCACHE_H
#define _NFS_FSCACHE_H
+#include <linux/swap.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include <linux/nfs4_mount.h>
#include <linux/fscache.h>
+#include <linux/iversion.h>
#ifdef CONFIG_NFS_FSCACHE
/*
- * set of NFS FS-Cache objects that form a superblock key
- */
-struct nfs_fscache_key {
- struct rb_node node;
- struct nfs_client *nfs_client; /* the server */
-
- /* the elements of the unique key - as used by nfs_compare_super() and
- * nfs_compare_mount_options() to distinguish superblocks */
- struct {
- struct {
- unsigned long s_flags; /* various flags
- * (& NFS_MS_MASK) */
- } super;
-
- struct {
- struct nfs_fsid fsid;
- int flags;
- unsigned int rsize; /* read size */
- unsigned int wsize; /* write size */
- unsigned int acregmin; /* attr cache timeouts */
- unsigned int acregmax;
- unsigned int acdirmin;
- unsigned int acdirmax;
- } nfs_server;
-
- struct {
- rpc_authflavor_t au_flavor;
- } rpc_auth;
-
- /* uniquifier - can be used if nfs_server.flags includes
- * NFS_MOUNT_UNSHARED */
- u8 uniq_len;
- char uniquifier[0];
- } key;
-};
-
-/*
* Definition of the auxiliary data attached to NFS inode storage objects
* within the cache.
*
@@ -70,84 +35,39 @@ struct nfs_fscache_inode_auxdata {
};
/*
- * fscache-index.c
- */
-extern struct fscache_netfs nfs_fscache_netfs;
-extern const struct fscache_cookie_def nfs_fscache_server_index_def;
-extern const struct fscache_cookie_def nfs_fscache_super_index_def;
-extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
-
-extern int nfs_fscache_register(void);
-extern void nfs_fscache_unregister(void);
-
-/*
* fscache.c
*/
-extern void nfs_fscache_get_client_cookie(struct nfs_client *);
-extern void nfs_fscache_release_client_cookie(struct nfs_client *);
-
-extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
+extern int nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
extern void nfs_fscache_release_super_cookie(struct super_block *);
extern void nfs_fscache_init_inode(struct inode *);
extern void nfs_fscache_clear_inode(struct inode *);
extern void nfs_fscache_open_file(struct inode *, struct file *);
+extern void nfs_fscache_release_file(struct inode *, struct file *);
-extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
-extern int nfs_fscache_release_page(struct page *, gfp_t);
+extern int __nfs_fscache_read_page(struct inode *, struct page *);
+extern void __nfs_fscache_write_page(struct inode *, struct page *);
-extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
- struct inode *, struct page *);
-extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
- struct inode *, struct address_space *,
- struct list_head *, unsigned *);
-extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
-
-/*
- * wait for a page to complete writing to the cache
- */
-static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
- struct page *page)
-{
- if (PageFsCache(page))
- fscache_wait_on_page_write(nfsi->fscache, page);
-}
-
-/*
- * release the caching state associated with a page if undergoing complete page
- * invalidation
- */
-static inline void nfs_fscache_invalidate_page(struct page *page,
- struct inode *inode)
+static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
{
- if (PageFsCache(page))
- __nfs_fscache_invalidate_page(page, inode);
+ if (PageFsCache(page)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ wait_on_page_fscache(page);
+ fscache_note_page_release(nfs_i_fscache(page->mapping->host));
+ nfs_inc_fscache_stats(page->mapping->host,
+ NFSIOS_FSCACHE_PAGES_UNCACHED);
+ }
+ return true;
}
/*
* Retrieve a page from an inode data storage object.
*/
-static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
- struct inode *inode,
- struct page *page)
+static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
{
- if (NFS_I(inode)->fscache)
- return __nfs_readpage_from_fscache(ctx, inode, page);
- return -ENOBUFS;
-}
-
-/*
- * Retrieve a set of pages from an inode data storage object.
- */
-static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
- struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- if (NFS_I(inode)->fscache)
- return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
- nr_pages);
+ if (nfs_i_fscache(inode))
+ return __nfs_fscache_read_page(inode, page);
return -ENOBUFS;
}
@@ -155,28 +75,39 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
* Store a page newly fetched from the server in an inode data storage object
* in the cache.
*/
-static inline void nfs_readpage_to_fscache(struct inode *inode,
- struct page *page,
- int sync)
+static inline void nfs_fscache_write_page(struct inode *inode,
+ struct page *page)
{
- if (PageFsCache(page))
- __nfs_readpage_to_fscache(inode, page, sync);
+ if (nfs_i_fscache(inode))
+ __nfs_fscache_write_page(inode, page);
}
-/*
- * Invalidate the contents of fscache for this inode. This will not sleep.
- */
-static inline void nfs_fscache_invalidate(struct inode *inode)
+static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
+ struct inode *inode)
{
- fscache_invalidate(NFS_I(inode)->fscache);
+ memset(auxdata, 0, sizeof(*auxdata));
+ auxdata->mtime_sec = inode->i_mtime.tv_sec;
+ auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
+ auxdata->ctime_sec = inode->i_ctime.tv_sec;
+ auxdata->ctime_nsec = inode->i_ctime.tv_nsec;
+
+ if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
+ auxdata->change_attr = inode_peek_iversion_raw(inode);
}
/*
- * Wait for an object to finish being invalidated.
+ * Invalidate the contents of fscache for this inode. This will not sleep.
*/
-static inline void nfs_fscache_wait_on_invalidate(struct inode *inode)
+static inline void nfs_fscache_invalidate(struct inode *inode, int flags)
{
- fscache_wait_on_invalidate(NFS_I(inode)->fscache);
+ struct nfs_fscache_inode_auxdata auxdata;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ if (nfsi->fscache) {
+ nfs_fscache_update_auxdata(&auxdata, inode);
+ fscache_invalidate(nfsi->fscache, &auxdata,
+ i_size_read(inode), flags);
+ }
}
/*
@@ -190,48 +121,24 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)
}
#else /* CONFIG_NFS_FSCACHE */
-static inline int nfs_fscache_register(void) { return 0; }
-static inline void nfs_fscache_unregister(void) {}
-
-static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
-static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
-
static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
static inline void nfs_fscache_init_inode(struct inode *inode) {}
static inline void nfs_fscache_clear_inode(struct inode *inode) {}
static inline void nfs_fscache_open_file(struct inode *inode,
struct file *filp) {}
+static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {}
static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
{
return 1; /* True: may release page */
}
-static inline void nfs_fscache_invalidate_page(struct page *page,
- struct inode *inode) {}
-static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
- struct page *page) {}
-
-static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
- struct inode *inode,
- struct page *page)
-{
- return -ENOBUFS;
-}
-static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
- struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
+static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
{
return -ENOBUFS;
}
-static inline void nfs_readpage_to_fscache(struct inode *inode,
- struct page *page, int sync) {}
-
-
-static inline void nfs_fscache_invalidate(struct inode *inode) {}
-static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {}
+static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {}
+static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {}
static inline const char *nfs_server_fscache_state(struct nfs_server *server)
{
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 59355c106ece..11ff2b2e060f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -80,31 +80,28 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
goto out;
/* get the actual root for this mount */
- fsinfo.fattr = nfs_alloc_fattr();
+ fsinfo.fattr = nfs_alloc_fattr_with_label(server);
if (fsinfo.fattr == NULL)
goto out_name;
- fsinfo.fattr->label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(fsinfo.fattr->label))
- goto out_fattr;
error = server->nfs_client->rpc_ops->getroot(server, ctx->mntfh, &fsinfo);
if (error < 0) {
dprintk("nfs_get_root: getattr error = %d\n", -error);
nfs_errorf(fc, "NFS: Couldn't getattr on root");
- goto out_label;
+ goto out_fattr;
}
- inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr, NULL);
+ inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr);
if (IS_ERR(inode)) {
dprintk("nfs_get_root: get root inode failed\n");
error = PTR_ERR(inode);
nfs_errorf(fc, "NFS: Couldn't get root inode");
- goto out_label;
+ goto out_fattr;
}
error = nfs_superblock_set_dummy_root(s, inode);
if (error != 0)
- goto out_label;
+ goto out_fattr;
/* root dentries normally start off anonymous and get spliced in later
* if the dentry tree reaches them; however if the dentry already
@@ -115,7 +112,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
dprintk("nfs_get_root: get root dentry failed\n");
error = PTR_ERR(root);
nfs_errorf(fc, "NFS: Couldn't get root dentry");
- goto out_label;
+ goto out_fattr;
}
security_d_instantiate(root, inode);
@@ -151,11 +148,9 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
!(kflags_out & SECURITY_LSM_NATIVE_LABELS))
server->caps &= ~NFS_CAP_SECURITY_LABEL;
- nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label);
+ nfs_setsecurity(inode, fsinfo.fattr);
error = 0;
-out_label:
- nfs4_label_free(fsinfo.fattr->label);
out_fattr:
nfs_free_fattr(fsinfo.fattr);
out_name:
@@ -165,5 +160,5 @@ out:
error_splat_root:
dput(fc->root);
fc->root = NULL;
- goto out_label;
+ goto out_fattr;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 853213b3a209..b4e46b0ffa2d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -203,17 +203,22 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
NFS_INO_INVALID_OTHER |
NFS_INO_INVALID_XATTR);
flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
- } else if (flags & NFS_INO_REVAL_PAGECACHE)
- flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE;
+ }
if (!nfs_has_xattr_cache(nfsi))
flags &= ~NFS_INO_INVALID_XATTR;
if (flags & NFS_INO_INVALID_DATA)
- nfs_fscache_invalidate(inode);
- if (inode->i_mapping->nrpages == 0)
- flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
- flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED);
+ nfs_fscache_invalidate(inode, 0);
+ flags &= ~NFS_INO_REVAL_FORCED;
+
nfsi->cache_validity |= flags;
+
+ if (inode->i_mapping->nrpages == 0)
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA |
+ NFS_INO_DATA_INVAL_DEFER);
+ else if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER;
+ trace_nfs_set_cache_invalid(inode, 0);
}
EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
@@ -230,19 +235,17 @@ static void nfs_zap_caches_locked(struct inode *inode)
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = jiffies;
- if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_DATA
- | NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL
- | NFS_INO_INVALID_XATTR
- | NFS_INO_REVAL_PAGECACHE);
- } else
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL
- | NFS_INO_INVALID_XATTR
- | NFS_INO_REVAL_PAGECACHE);
+ if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR |
+ NFS_INO_INVALID_DATA |
+ NFS_INO_INVALID_ACCESS |
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR |
+ NFS_INO_INVALID_ACCESS |
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR);
nfs_zap_label_cache_locked(nfsi);
}
@@ -350,37 +353,32 @@ static void nfs_clear_label_invalid(struct inode *inode)
spin_unlock(&inode->i_lock);
}
-void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr)
{
int error;
- if (label == NULL)
+ if (fattr->label == NULL)
return;
if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
- error = security_inode_notifysecctx(inode, label->label,
- label->len);
+ error = security_inode_notifysecctx(inode, fattr->label->label,
+ fattr->label->len);
if (error)
printk(KERN_ERR "%s() %s %d "
"security_inode_notifysecctx() %d\n",
__func__,
- (char *)label->label,
- label->len, error);
+ (char *)fattr->label->label,
+ fattr->label->len, error);
nfs_clear_label_invalid(inode);
}
}
struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
{
- struct nfs4_label *label = NULL;
- int minor_version = server->nfs_client->cl_minorversion;
-
- if (minor_version < 2)
- return label;
+ struct nfs4_label *label;
if (!(server->caps & NFS_CAP_SECURITY_LABEL))
- return label;
+ return NULL;
label = kzalloc(sizeof(struct nfs4_label), flags);
if (label == NULL)
@@ -397,8 +395,7 @@ struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
}
EXPORT_SYMBOL_GPL(nfs4_label_alloc);
#else
-void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr)
{
}
#endif
@@ -426,12 +423,28 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
return inode;
}
+static void nfs_inode_init_regular(struct nfs_inode *nfsi)
+{
+ atomic_long_set(&nfsi->nrequests, 0);
+ INIT_LIST_HEAD(&nfsi->commit_info.list);
+ atomic_long_set(&nfsi->commit_info.ncommit, 0);
+ atomic_set(&nfsi->commit_info.rpcs_out, 0);
+ mutex_init(&nfsi->commit_mutex);
+}
+
+static void nfs_inode_init_dir(struct nfs_inode *nfsi)
+{
+ nfsi->cache_change_attribute = 0;
+ memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+ init_rwsem(&nfsi->rmdir_sem);
+}
+
/*
* This is our front-end to iget that looks up inodes by file handle
* instead of inode number.
*/
struct inode *
-nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
{
struct nfs_find_desc desc = {
.fh = fh,
@@ -480,10 +493,12 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
if (S_ISREG(inode->i_mode)) {
inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
inode->i_data.a_ops = &nfs_file_aops;
+ nfs_inode_init_regular(nfsi);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
inode->i_fop = &nfs_dir_operations;
inode->i_data.a_ops = &nfs_dir_aops;
+ nfs_inode_init_dir(nfsi);
/* Deal with crossing mountpoints */
if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
@@ -509,7 +524,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_uid = make_kuid(&init_user_ns, -2);
inode->i_gid = make_kgid(&init_user_ns, -2);
inode->i_blocks = 0;
- memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
nfsi->write_io = 0;
nfsi->read_io = 0;
@@ -547,8 +561,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_gid = fattr->gid;
else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
- if (nfs_server_capable(inode, NFS_CAP_XATTR))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED &&
@@ -563,7 +575,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
fattr->size != 0)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
- nfs_setsecurity(inode, fattr, label);
+ nfs_setsecurity(inode, fattr);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
@@ -632,7 +644,7 @@ nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (S_ISREG(inode->i_mode))
nfs_sync_inode(inode);
- fattr = nfs_alloc_fattr();
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
if (fattr == NULL) {
error = -ENOMEM;
goto out;
@@ -666,6 +678,7 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
if (err)
goto out;
+ trace_nfs_size_truncate(inode, offset);
i_size_write(inode, offset);
/* Optimisation */
if (offset == 0)
@@ -767,26 +780,32 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
-static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
+/*
+ * Don't request help from readdirplus if the file is being written to,
+ * or if attribute caching is turned off
+ */
+static bool nfs_getattr_readdirplus_enable(const struct inode *inode)
{
- struct dentry *parent;
+ return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) &&
+ !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ;
+}
- if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
- return;
- parent = dget_parent(dentry);
- nfs_force_use_readdirplus(d_inode(parent));
- dput(parent);
+static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
+{
+ if (!IS_ROOT(dentry)) {
+ struct dentry *parent = dget_parent(dentry);
+ nfs_readdir_record_entry_cache_miss(d_inode(parent));
+ dput(parent);
+ }
}
static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
{
- struct dentry *parent;
-
- if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
- return;
- parent = dget_parent(dentry);
- nfs_advise_use_readdirplus(d_inode(parent));
- dput(parent);
+ if (!IS_ROOT(dentry)) {
+ struct dentry *parent = dget_parent(dentry);
+ nfs_readdir_record_entry_cache_hit(d_inode(parent));
+ dput(parent);
+ }
}
static u32 nfs_get_valid_attrmask(struct inode *inode)
@@ -822,6 +841,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
int err = 0;
bool force_sync = query_flags & AT_STATX_FORCE_SYNC;
bool do_update = false;
+ bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode);
trace_nfs_getattr_enter(inode);
@@ -830,17 +850,15 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
STATX_INO | STATX_SIZE | STATX_BLOCKS;
if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
- nfs_readdirplus_parent_cache_hit(path->dentry);
+ if (readdirplus_enabled)
+ nfs_readdirplus_parent_cache_hit(path->dentry);
goto out_no_revalidate;
}
/* Flush out writes to the server in order to update c/mtime. */
- if ((request_mask & (STATX_CTIME|STATX_MTIME)) &&
- S_ISREG(inode->i_mode)) {
- err = filemap_write_and_wait(inode->i_mapping);
- if (err)
- goto out;
- }
+ if ((request_mask & (STATX_CTIME | STATX_MTIME)) &&
+ S_ISREG(inode->i_mode))
+ filemap_write_and_wait(inode->i_mapping);
/*
* We may force a getattr if the user cares about atime.
@@ -883,15 +901,12 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
if (do_update) {
- /* Update the attribute cache */
- if (!(server->flags & NFS_MOUNT_NOAC))
+ if (readdirplus_enabled)
nfs_readdirplus_parent_cache_miss(path->dentry);
- else
- nfs_readdirplus_parent_cache_hit(path->dentry);
err = __nfs_revalidate_inode(server, inode);
if (err)
goto out;
- } else
+ } else if (readdirplus_enabled)
nfs_readdirplus_parent_cache_hit(path->dentry);
out_no_revalidate:
/* Only return attributes that were revalidated. */
@@ -937,7 +952,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
res = __nfs_find_lock_context(ctx);
rcu_read_unlock();
if (res == NULL) {
- new = kmalloc(sizeof(*new), GFP_KERNEL);
+ new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
if (new == NULL)
return ERR_PTR(-ENOMEM);
nfs_init_lock_context(new);
@@ -1015,7 +1030,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
{
struct nfs_open_context *ctx;
- ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
if (!ctx)
return ERR_PTR(-ENOMEM);
nfs_sb_active(dentry->d_sb);
@@ -1024,7 +1039,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
ctx->cred = get_cred(filp->f_cred);
else
ctx->cred = get_current_cred();
- ctx->ll_cred = NULL;
+ rcu_assign_pointer(ctx->ll_cred, NULL);
ctx->state = NULL;
ctx->mode = f_mode;
ctx->flags = 0;
@@ -1063,7 +1078,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
put_cred(ctx->cred);
dput(ctx->dentry);
nfs_sb_deactive(sb);
- put_rpccred(ctx->ll_cred);
+ put_rpccred(rcu_dereference_protected(ctx->ll_cred, 1));
kfree(ctx->mdsthreshold);
kfree_rcu(ctx, rcu_head);
}
@@ -1165,7 +1180,6 @@ int nfs_open(struct inode *inode, struct file *filp)
nfs_fscache_open_file(inode, filp);
return 0;
}
-EXPORT_SYMBOL_GPL(nfs_open);
/*
* This function is called whenever some part of NFS notices that
@@ -1175,7 +1189,6 @@ int
__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
int status = -ESTALE;
- struct nfs4_label *label = NULL;
struct nfs_fattr *fattr = NULL;
struct nfs_inode *nfsi = NFS_I(inode);
@@ -1197,20 +1210,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
}
status = -ENOMEM;
- fattr = nfs_alloc_fattr();
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
if (fattr == NULL)
goto out;
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
- if (IS_ERR(label)) {
- status = PTR_ERR(label);
- goto out;
- }
-
- status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr,
- label, inode);
+ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, inode);
if (status != 0) {
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",
inode->i_sb->s_id,
@@ -1227,7 +1233,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
else
nfs_zap_caches(inode);
}
- goto err_out;
+ goto out;
}
status = nfs_refresh_inode(inode, fattr);
@@ -1235,20 +1241,18 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",
inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(inode), status);
- goto err_out;
+ goto out;
}
if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
nfs_zap_acl_cache(inode);
- nfs_setsecurity(inode, fattr, label);
+ nfs_setsecurity(inode, fattr);
dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",
inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(inode));
-err_out:
- nfs4_label_free(label);
out:
nfs_free_fattr(fattr);
trace_nfs_revalidate_inode_exit(inode, status);
@@ -1281,6 +1285,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
{
int ret;
+ nfs_fscache_invalidate(inode, 0);
if (mapping->nrpages != 0) {
if (S_ISREG(inode->i_mode)) {
ret = nfs_sync_mapping(mapping);
@@ -1292,7 +1297,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
return ret;
}
nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
- nfs_fscache_wait_on_invalidate(inode);
dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n",
inode->i_sb->s_id,
@@ -1446,13 +1450,12 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
&& timespec64_equal(&ts, &fattr->pre_mtime)) {
inode->i_mtime = fattr->mtime;
- if (S_ISDIR(inode->i_mode))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
}
if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
&& !nfs_have_writebacks(inode)) {
+ trace_nfs_size_wcc(inode, fattr->size);
i_size_write(inode, nfs_size_to_loff_t(fattr->size));
}
}
@@ -1579,18 +1582,37 @@ struct nfs_fattr *nfs_alloc_fattr(void)
{
struct nfs_fattr *fattr;
- fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
- if (fattr != NULL)
+ fattr = kmalloc(sizeof(*fattr), GFP_KERNEL);
+ if (fattr != NULL) {
nfs_fattr_init(fattr);
+ fattr->label = NULL;
+ }
return fattr;
}
EXPORT_SYMBOL_GPL(nfs_alloc_fattr);
+struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server)
+{
+ struct nfs_fattr *fattr = nfs_alloc_fattr();
+
+ if (!fattr)
+ return NULL;
+
+ fattr->label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(fattr->label)) {
+ kfree(fattr);
+ return NULL;
+ }
+
+ return fattr;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_fattr_with_label);
+
struct nfs_fh *nfs_alloc_fhandle(void)
{
struct nfs_fh *fh;
- fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
+ fh = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
if (fh != NULL)
fh->size = 0;
return fh;
@@ -1777,8 +1799,10 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER |
NFS_INO_INVALID_NLINK;
unsigned long cache_validity = NFS_I(inode)->cache_validity;
+ enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type;
- if (!(cache_validity & NFS_INO_INVALID_CHANGE) &&
+ if (ctype != NFS4_CHANGE_TYPE_IS_UNDEFINED &&
+ !(cache_validity & NFS_INO_INVALID_CHANGE) &&
(cache_validity & check_valid) != 0 &&
(fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
nfs_inode_attrs_cmp_monotonic(fattr, inode) == 0)
@@ -2095,16 +2119,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
if (!nfs_have_writebacks(inode) || new_isize > cur_isize) {
+ trace_nfs_size_update(inode, new_isize);
i_size_write(inode, new_isize);
if (!have_writers)
invalid |= NFS_INO_INVALID_DATA;
}
- dprintk("NFS: isize change on server for file %s/%ld "
- "(%Ld to %Ld)\n",
- inode->i_sb->s_id,
- inode->i_ino,
- (long long)cur_isize,
- (long long)new_isize);
}
if (new_isize == 0 &&
!(fattr->valid & (NFS_ATTR_FATTR_SPACE_USED |
@@ -2155,11 +2174,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
save_cache_validity & NFS_INO_INVALID_OTHER;
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
- if (inode->i_nlink != fattr->nlink) {
- if (S_ISDIR(inode->i_mode))
- invalid |= NFS_INO_INVALID_DATA;
+ if (inode->i_nlink != fattr->nlink)
set_nlink(inode, fattr->nlink);
- }
} else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_NLINK;
@@ -2221,7 +2237,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
struct inode *nfs_alloc_inode(struct super_block *sb)
{
struct nfs_inode *nfsi;
- nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
+ nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL);
if (!nfsi)
return NULL;
nfsi->flags = 0UL;
@@ -2260,14 +2276,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->open_files);
INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
- INIT_LIST_HEAD(&nfsi->commit_info.list);
- atomic_long_set(&nfsi->nrequests, 0);
- atomic_long_set(&nfsi->commit_info.ncommit, 0);
- atomic_set(&nfsi->commit_info.rpcs_out, 0);
- init_rwsem(&nfsi->rmdir_sem);
- mutex_init(&nfsi->commit_mutex);
nfs4_init_once(nfsi);
- nfsi->cache_change_attribute = 0;
}
static int __init nfs_init_inodecache(void)
@@ -2361,10 +2370,6 @@ static int __init init_nfs_fs(void)
if (err < 0)
goto out9;
- err = nfs_fscache_register();
- if (err < 0)
- goto out8;
-
err = nfsiod_start();
if (err)
goto out7;
@@ -2416,8 +2421,6 @@ out5:
out6:
nfsiod_stop();
out7:
- nfs_fscache_unregister();
-out8:
unregister_pernet_subsys(&nfs_net_ops);
out9:
nfs_sysfs_exit();
@@ -2432,7 +2435,6 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_readpagecache();
nfs_destroy_inodecache();
nfs_destroy_nfspagecache();
- nfs_fscache_unregister();
unregister_pernet_subsys(&nfs_net_ops);
rpc_proc_unregister(&init_net, "nfs");
unregister_nfs_fs();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 66fc936834f2..7eefa16ed381 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -42,6 +42,16 @@ static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry)
return true;
}
+static inline fmode_t flags_to_mode(int flags)
+{
+ fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
+ if ((flags & O_ACCMODE) != O_WRONLY)
+ res |= FMODE_READ;
+ if ((flags & O_ACCMODE) != O_RDONLY)
+ res |= FMODE_WRITE;
+ return res;
+}
+
/*
* Note: RFC 1813 doesn't limit the number of auth flavors that
* a server can return, so make something up.
@@ -193,7 +203,7 @@ extern void nfs_clients_exit(struct net *net);
extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
struct nfs_client *nfs_get_client(const struct nfs_client_initdata *);
-int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
+int nfs_probe_server(struct nfs_server *, struct nfs_fh *);
void nfs_server_insert_lists(struct nfs_server *);
void nfs_server_remove_lists(struct nfs_server *);
void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans);
@@ -209,6 +219,7 @@ extern struct nfs_client *
nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
struct nfs4_sessionid *, u32);
extern struct nfs_server *nfs_create_server(struct fs_context *);
+extern void nfs4_server_set_init_caps(struct nfs_server *);
extern struct nfs_server *nfs4_create_server(struct fs_context *);
extern struct nfs_server *nfs4_create_referral_server(struct fs_context *);
extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
@@ -341,14 +352,6 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
return dst;
}
-static inline void nfs4_label_free(struct nfs4_label *label)
-{
- if (label) {
- kfree(label->label);
- kfree(label);
- }
- return;
-}
static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
{
@@ -357,7 +360,6 @@ static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
}
#else
static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
-static inline void nfs4_label_free(void *label) {}
static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
{
}
@@ -374,13 +376,14 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *);
/* dir.c */
-extern void nfs_advise_use_readdirplus(struct inode *dir);
-extern void nfs_force_use_readdirplus(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_hit(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_miss(struct inode *dir);
extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
struct shrink_control *sc);
struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
+void nfs_d_prune_case_insensitive_aliases(struct inode *inode);
int nfs_create(struct user_namespace *, struct inode *, struct dentry *,
umode_t, bool);
int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry *,
@@ -395,6 +398,20 @@ int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t,
int nfs_rename(struct user_namespace *, struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
+#ifdef CONFIG_NFS_V4_2
+static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server)
+{
+ if (!(server->caps & NFS_CAP_XATTR))
+ return 0;
+ return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST;
+}
+#else
+static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server)
+{
+ return 0;
+}
+#endif
+
/* file.c */
int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
loff_t nfs_file_llseek(struct file *, loff_t, int);
@@ -580,6 +597,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf,
!nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
}
+static inline gfp_t nfs_io_gfp_mask(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ return GFP_KERNEL;
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index bc0c698f3350..3295af4110f1 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -308,8 +308,7 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server)
/* Look it up again to get its attributes */
err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry,
- ctx->mntfh, ctx->clone_data.fattr,
- NULL);
+ ctx->mntfh, ctx->clone_data.fattr);
dput(parent);
if (err != 0)
return err;
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7fba7711e6b3..05c3b4b2b3dd 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -949,13 +949,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_filename_inline(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return error;
+ return -EAGAIN;
/*
* The type (size and byte order) of nfscookie isn't defined in
* RFC 1094. This implementation assumes that it's an XDR uint32.
*/
- entry->prev_cookie = entry->cookie;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
return -EAGAIN;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f7524310ddf4..1597eef40d54 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -100,8 +100,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode)
+ struct nfs_fattr *fattr, struct inode *inode)
{
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -193,8 +192,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len,
static int
nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
unsigned short task_flags = 0;
@@ -209,7 +207,7 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
}
static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
const char dotdot[] = "..";
const size_t len = strlen(dotdot);
@@ -222,7 +220,8 @@ static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
task_flags);
}
-static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry,
+ const struct cred *cred)
{
struct nfs3_accessargs arg = {
.fh = NFS_FH(inode),
@@ -233,7 +232,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
.rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
.rpc_argp = &arg,
.rpc_resp = &res,
- .rpc_cred = entry->cred,
+ .rpc_cred = cred,
};
int status = -ENOMEM;
@@ -323,7 +322,7 @@ nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata
if (status != 0)
return ERR_PTR(status);
- return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL);
+ return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
}
static void nfs3_free_createdata(struct nfs3_createdata *data)
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index e6eca1d7481b..3b0b650c9c5a 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1261,6 +1261,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
static void encode_readdirplus3args(struct xdr_stream *xdr,
const struct nfs3_readdirargs *args)
{
+ uint32_t dircount = args->count;
+ uint32_t maxcount = args->count;
__be32 *p;
encode_nfs_fh3(xdr, args->fh);
@@ -1273,9 +1275,8 @@ static void encode_readdirplus3args(struct xdr_stream *xdr,
* readdirplus: need dircount + buffer size.
* We just make sure we make dircount big enough
*/
- *p++ = cpu_to_be32(args->count >> 3);
-
- *p = cpu_to_be32(args->count);
+ *p++ = cpu_to_be32(dircount);
+ *p = cpu_to_be32(maxcount);
}
static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
@@ -1967,7 +1968,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
bool plus)
{
struct user_namespace *userns = rpc_userns(entry->server->client);
- struct nfs_entry old = *entry;
__be32 *p;
int error;
u64 new_cookie;
@@ -1987,15 +1987,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_fileid3(xdr, &entry->ino);
if (unlikely(error))
- return error;
+ return -EAGAIN;
error = decode_inline_filename3(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return error;
+ return -EAGAIN;
error = decode_cookie3(xdr, &new_cookie);
if (unlikely(error))
- return error;
+ return -EAGAIN;
entry->d_type = DT_UNKNOWN;
@@ -2003,7 +2003,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->fattr->valid = 0;
error = decode_post_op_attr(xdr, entry->fattr, userns);
if (unlikely(error))
- return error;
+ return -EAGAIN;
if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
@@ -2018,24 +2018,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
return -EAGAIN;
if (*p != xdr_zero) {
error = decode_nfs_fh3(xdr, entry->fh);
- if (unlikely(error)) {
- if (error == -E2BIG)
- goto out_truncated;
- return error;
- }
+ if (unlikely(error))
+ return -EAGAIN;
} else
zero_nfs_fh3(entry->fh);
}
- entry->prev_cookie = entry->cookie;
entry->cookie = new_cookie;
return 0;
-
-out_truncated:
- dprintk("NFS: directory entry contains invalid file handle\n");
- *entry = old;
- return -EAGAIN;
}
/*
@@ -2227,7 +2218,8 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
/* ignore properties */
result->lease_time = 0;
- result->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
+ result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ result->xattr_support = 0;
return 0;
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index a24349512ffe..068c45b3bc1a 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -46,7 +46,7 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
{
struct inode *inode = file_inode(filep);
struct nfs_server *server = NFS_SERVER(inode);
- u32 bitmask[3];
+ u32 bitmask[NFS_BITMASK_SZ];
struct nfs42_falloc_args args = {
.falloc_fh = NFS_FH(inode),
.falloc_offset = offset,
@@ -69,9 +69,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
return status;
}
- memcpy(bitmask, server->cache_consistency_bitmask, sizeof(bitmask));
- if (server->attr_bitmask[1] & FATTR4_WORD1_SPACE_USED)
- bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+ nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, inode,
+ NFS_INO_INVALID_BLOCKS);
res.falloc_fattr = nfs_alloc_fattr();
if (!res.falloc_fattr)
@@ -83,6 +82,10 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
status = nfs_post_op_update_inode_force_wcc(inode,
res.falloc_fattr);
+ if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE])
+ trace_nfs4_fallocate(inode, &args, status);
+ else
+ trace_nfs4_deallocate(inode, &args, status);
kfree(res.falloc_fattr);
return status;
}
@@ -172,28 +175,27 @@ static int handle_async_copy(struct nfs42_copy_res *res,
nfs4_stateid *src_stateid,
bool *restart)
{
- struct nfs4_copy_state *copy, *tmp_copy;
+ struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
int status = NFS4_OK;
- bool found_pending = false;
struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
struct nfs_open_context *src_ctx = nfs_file_open_context(src);
- copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+ copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
if (!copy)
return -ENOMEM;
spin_lock(&dst_server->nfs_client->cl_lock);
- list_for_each_entry(tmp_copy,
+ list_for_each_entry(iter,
&dst_server->nfs_client->pending_cb_stateids,
copies) {
- if (memcmp(&res->write_res.stateid, &tmp_copy->stateid,
+ if (memcmp(&res->write_res.stateid, &iter->stateid,
NFS4_STATEID_SIZE))
continue;
- found_pending = true;
- list_del(&tmp_copy->copies);
+ tmp_copy = iter;
+ list_del(&iter->copies);
break;
}
- if (found_pending) {
+ if (tmp_copy) {
spin_unlock(&dst_server->nfs_client->cl_lock);
kfree(copy);
copy = tmp_copy;
@@ -251,7 +253,7 @@ static int process_copy_commit(struct file *dst, loff_t pos_dst,
struct nfs_commitres cres;
int status = -ENOMEM;
- cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS);
+ cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL);
if (!cres.verf)
goto out;
@@ -285,7 +287,9 @@ static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len)
loff_t newsize = pos + len;
loff_t end = newsize - 1;
- truncate_pagecache_range(inode, pos, end);
+ WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_SHIFT, end >> PAGE_SHIFT));
+
spin_lock(&inode->i_lock);
if (newsize > i_size_read(inode))
i_size_write(inode, newsize);
@@ -352,7 +356,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
res->commit_res.verf = NULL;
if (args->sync) {
res->commit_res.verf =
- kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS);
+ kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL);
if (!res->commit_res.verf)
return -ENOMEM;
}
@@ -363,6 +367,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
status = nfs4_call_sync(dst_server->client, dst_server, &msg,
&args->seq_args, &res->seq_res, 0);
+ trace_nfs4_copy(src_inode, dst_inode, args, res, nss, status);
if (status == -ENOTSUPP)
dst_server->caps &= ~NFS_CAP_COPY;
if (status)
@@ -504,6 +509,7 @@ static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata)
{
struct nfs42_offloadcancel_data *data = calldata;
+ trace_nfs4_offload_cancel(&data->args, task->tk_status);
nfs41_sequence_done(task, &data->res.osr_seq_res);
if (task->tk_status &&
nfs4_async_handle_error(task, data->seq_server, NULL,
@@ -545,7 +551,7 @@ static int nfs42_do_offload_cancel_async(struct file *dst,
if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL))
return -EOPNOTSUPP;
- data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_NOFS);
+ data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
@@ -584,8 +590,10 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
ctx = get_nfs_open_context(nfs_file_open_context(src));
l_ctx = nfs_get_lock_context(ctx);
- if (IS_ERR(l_ctx))
- return PTR_ERR(l_ctx);
+ if (IS_ERR(l_ctx)) {
+ status = PTR_ERR(l_ctx);
+ goto out;
+ }
status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx,
FMODE_READ);
@@ -593,14 +601,16 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
if (status) {
if (status == -EAGAIN)
status = -NFS4ERR_BAD_STATEID;
- return status;
+ goto out;
}
status = nfs4_call_sync(src_server->client, src_server, &msg,
&args->cna_seq_args, &res->cnr_seq_res, 0);
+ trace_nfs4_copy_notify(file_inode(src), args, res, status);
if (status == -ENOTSUPP)
src_server->caps &= ~NFS_CAP_COPY_NOTIFY;
+out:
put_nfs_open_context(nfs_file_open_context(src));
return status;
}
@@ -618,7 +628,7 @@ int nfs42_proc_copy_notify(struct file *src, struct file *dst,
if (!(src_server->caps & NFS_CAP_COPY_NOTIFY))
return -EOPNOTSUPP;
- args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_NOFS);
+ args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_KERNEL);
if (args == NULL)
return -ENOMEM;
@@ -678,6 +688,7 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
status = nfs4_call_sync(server->client, server, &msg,
&args.seq_args, &res.seq_res, 0);
+ trace_nfs4_llseek(inode, &args, &res, status);
if (status == -ENOTSUPP)
server->caps &= ~NFS_CAP_SEEK;
if (status)
@@ -1005,7 +1016,7 @@ int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
return -EOPNOTSUPP;
if (n > NFS42_LAYOUTERROR_MAX)
return -EINVAL;
- data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS);
+ data = nfs42_alloc_layouterror_data(lseg, nfs_io_gfp_mask());
if (!data)
return -ENOMEM;
for (i = 0; i < n; i++) {
@@ -1034,13 +1045,14 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
struct inode *src_inode = file_inode(src_f);
struct inode *dst_inode = file_inode(dst_f);
struct nfs_server *server = NFS_SERVER(dst_inode);
+ __u32 dst_bitmask[NFS_BITMASK_SZ];
struct nfs42_clone_args args = {
.src_fh = NFS_FH(src_inode),
.dst_fh = NFS_FH(dst_inode),
.src_offset = src_offset,
.dst_offset = dst_offset,
.count = count,
- .dst_bitmask = server->cache_consistency_bitmask,
+ .dst_bitmask = dst_bitmask,
};
struct nfs42_clone_res res = {
.server = server,
@@ -1069,8 +1081,12 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
if (!res.dst_fattr)
return -ENOMEM;
+ nfs4_bitmask_set(dst_bitmask, server->cache_consistency_bitmask,
+ dst_inode, NFS_INO_INVALID_BLOCKS);
+
status = nfs4_call_sync(server->client, server, msg,
&args.seq_args, &res.seq_res, 0);
+ trace_nfs4_clone(src_inode, dst_inode, &args, status);
if (status == 0) {
nfs42_copy_dest_done(dst_inode, dst_offset, count);
status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 1c4d2a05b401..e7b34f7e0614 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -199,7 +199,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
flags = NFS4_XATTR_ENTRY_EXTVAL;
}
- buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ buf = kmalloc(alloclen, GFP_KERNEL);
if (buf == NULL)
return NULL;
entry = (struct nfs4_xattr_entry *)buf;
@@ -213,7 +213,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
if (flags & NFS4_XATTR_ENTRY_EXTVAL) {
- valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ valp = kvmalloc(len, GFP_KERNEL);
if (valp == NULL) {
kfree(buf);
return NULL;
@@ -289,8 +289,7 @@ nfs4_xattr_alloc_cache(void)
{
struct nfs4_xattr_cache *cache;
- cache = kmem_cache_alloc(nfs4_xattr_cache_cachep,
- GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL);
if (cache == NULL)
return NULL;
@@ -998,7 +997,7 @@ int __init nfs4_xattr_cache_init(void)
nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
sizeof(struct nfs4_xattr_cache), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
nfs4_xattr_cache_init_once);
if (nfs4_xattr_cache_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index c8bad735e4c1..271e5f92ed01 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1434,8 +1434,7 @@ static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
status = decode_clone(xdr);
if (status)
goto out;
- status = decode_getfattr(xdr, res->dst_fattr, res->server);
-
+ decode_getfattr(xdr, res->dst_fattr, res->server);
out:
res->rpc_status = status;
return status;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ba78df4b13d9..79df6e83881b 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -42,6 +42,7 @@ enum nfs4_client_state {
NFS4CLNT_LEASE_MOVED,
NFS4CLNT_DELEGATION_EXPIRED,
NFS4CLNT_RUN_MANAGER,
+ NFS4CLNT_MANAGER_AVAILABLE,
NFS4CLNT_RECALL_RUNNING,
NFS4CLNT_RECALL_ANY_LAYOUT_READ,
NFS4CLNT_RECALL_ANY_LAYOUT_RW,
@@ -234,7 +235,6 @@ struct nfs4_opendata {
struct nfs4_string group_name;
struct nfs4_label *a_label;
struct nfs_fattr f_attr;
- struct nfs4_label *f_label;
struct dentry *dir;
struct dentry *dentry;
struct nfs4_state_owner *owner;
@@ -261,8 +261,8 @@ struct nfs4_state_maintenance_ops {
};
struct nfs4_mig_recovery_ops {
- int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
- struct page *, const struct cred *);
+ int (*get_locations)(struct nfs_server *, struct nfs_fh *,
+ struct nfs4_fs_locations *, struct page *, const struct cred *);
int (*fsid_present)(struct inode *, const struct cred *);
};
@@ -281,7 +281,8 @@ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *,
int nfs4_submount(struct fs_context *, struct nfs_server *);
int nfs4_replace_transport(struct nfs_server *server,
const struct nfs4_fs_locations *locations);
-
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+ size_t salen, struct net *net, int port);
/* nfs4proc.c */
extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
extern int nfs4_async_handle_error(struct rpc_task *task,
@@ -303,8 +304,9 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
struct nfs4_fs_locations *, struct page *);
-extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
- struct page *page, const struct cred *);
+extern int nfs4_proc_get_locations(struct nfs_server *, struct nfs_fh *,
+ struct nfs4_fs_locations *,
+ struct page *page, const struct cred *);
extern int nfs4_proc_fsid_present(struct inode *, const struct cred *);
extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *,
struct dentry *,
@@ -316,9 +318,10 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
const struct nfs_open_context *ctx,
const struct nfs_lock_context *l_ctx,
fmode_t fmode);
+extern void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[],
+ struct inode *inode, unsigned long cache_validity);
extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode);
+ struct nfs_fattr *fattr, struct inode *inode);
extern int update_open_stateid(struct nfs4_state *state,
const nfs4_stateid *open_stateid,
const nfs4_stateid *deleg_stateid,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index af57332503be..47a6cf892c95 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1059,31 +1059,15 @@ static void nfs4_session_limit_xasize(struct nfs_server *server)
#endif
}
-static int nfs4_server_common_setup(struct nfs_server *server,
- struct nfs_fh *mntfh, bool auth_probe)
+void nfs4_server_set_init_caps(struct nfs_server *server)
{
- struct nfs_fattr *fattr;
- int error;
-
- /* data servers support only a subset of NFSv4.1 */
- if (is_ds_only_client(server->nfs_client))
- return -EPROTONOSUPPORT;
-
- fattr = nfs_alloc_fattr();
- if (fattr == NULL)
- return -ENOMEM;
-
- /* We must ensure the session is initialised first */
- error = nfs4_init_session(server->nfs_client);
- if (error < 0)
- goto out;
-
/* Set the basic capabilities */
server->caps |= server->nfs_client->cl_mvops->init_caps;
if (server->flags & NFS_MOUNT_NORDIRPLUS)
server->caps &= ~NFS_CAP_READDIRPLUS;
if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA)
server->caps &= ~NFS_CAP_READ_PLUS;
+
/*
* Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
* authentication.
@@ -1091,7 +1075,23 @@ static int nfs4_server_common_setup(struct nfs_server *server,
if (nfs4_disable_idmapping &&
server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
server->caps |= NFS_CAP_UIDGID_NOMAP;
+}
+
+static int nfs4_server_common_setup(struct nfs_server *server,
+ struct nfs_fh *mntfh, bool auth_probe)
+{
+ int error;
+ /* data servers support only a subset of NFSv4.1 */
+ if (is_ds_only_client(server->nfs_client))
+ return -EPROTONOSUPPORT;
+
+ /* We must ensure the session is initialised first */
+ error = nfs4_init_session(server->nfs_client);
+ if (error < 0)
+ goto out;
+
+ nfs4_server_set_init_caps(server);
/* Probe the root fh to retrieve its FSID and filehandle */
error = nfs4_get_rootfh(server, mntfh, auth_probe);
@@ -1103,7 +1103,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
(unsigned long long) server->fsid.minor);
nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
- error = nfs_probe_fsinfo(server, mntfh, fattr);
+ error = nfs_probe_server(server, mntfh);
if (error < 0)
goto out;
@@ -1117,7 +1117,6 @@ static int nfs4_server_common_setup(struct nfs_server *server,
server->mount_time = jiffies;
server->destroy = nfs4_destroy_server;
out:
- nfs_free_fattr(fattr);
return error;
}
@@ -1288,30 +1287,6 @@ error:
return ERR_PTR(error);
}
-/*
- * Grab the destination's particulars, including lease expiry time.
- *
- * Returns zero if probe succeeded and retrieved FSID matches the FSID
- * we have cached.
- */
-static int nfs_probe_destination(struct nfs_server *server)
-{
- struct inode *inode = d_inode(server->super->s_root);
- struct nfs_fattr *fattr;
- int error;
-
- fattr = nfs_alloc_fattr();
- if (fattr == NULL)
- return -ENOMEM;
-
- /* Sanity: the probe won't work if the destination server
- * does not recognize the migrated FH. */
- error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr);
-
- nfs_free_fattr(fattr);
- return error;
-}
-
/**
* nfs4_update_server - Move an nfs_server to a different nfs_client
*
@@ -1368,9 +1343,12 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
}
nfs_put_client(clp);
- if (server->nfs_client->cl_hostname == NULL)
+ if (server->nfs_client->cl_hostname == NULL) {
server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+ if (server->nfs_client->cl_hostname == NULL)
+ return -ENOMEM;
+ }
nfs_server_insert_lists(server);
- return nfs_probe_destination(server);
+ return nfs_probe_server(server, NFS_FH(d_inode(server->super->s_root)));
}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index c91565227ea2..7b861e4f0533 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -32,6 +32,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
struct dentry *parent = NULL;
struct inode *dir;
unsigned openflags = filp->f_flags;
+ fmode_t f_mode;
struct iattr attr;
int err;
@@ -50,8 +51,9 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
+ f_mode = filp->f_mode;
if ((openflags & O_ACCMODE) == 3)
- return nfs_open(inode, filp);
+ f_mode |= flags_to_mode(openflags);
/* We can't create new files here */
openflags &= ~(O_CREAT|O_EXCL);
@@ -59,7 +61,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
+ ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
@@ -165,7 +167,7 @@ retry:
if (sync)
return -EOPNOTSUPP;
cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res),
- GFP_NOFS);
+ GFP_KERNEL);
if (unlikely(cn_resp == NULL))
return -ENOMEM;
@@ -180,8 +182,8 @@ retry:
ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count,
nss, cnrs, sync);
out:
- if (!nfs42_files_from_same_server(file_in, file_out))
- kfree(cn_resp);
+ kfree(cn_resp);
+
if (ret == -EAGAIN)
goto retry;
return ret;
@@ -317,7 +319,7 @@ static int read_name_gen = 1;
static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
struct nfs_fh *src_fh, nfs4_stateid *stateid)
{
- struct nfs_fattr fattr;
+ struct nfs_fattr *fattr = nfs_alloc_fattr();
struct file *filep, *res;
struct nfs_server *server;
struct inode *r_ino = NULL;
@@ -328,9 +330,10 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
server = NFS_SERVER(ss_mnt->mnt_root->d_inode);
- nfs_fattr_init(&fattr);
+ if (!fattr)
+ return ERR_PTR(-ENOMEM);
- status = nfs4_proc_getattr(server, src_fh, &fattr, NULL, NULL);
+ status = nfs4_proc_getattr(server, src_fh, fattr, NULL);
if (status < 0) {
res = ERR_PTR(status);
goto out;
@@ -338,25 +341,23 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
res = ERR_PTR(-ENOMEM);
len = strlen(SSC_READ_NAME_BODY) + 16;
- read_name = kzalloc(len, GFP_NOFS);
+ read_name = kzalloc(len, GFP_KERNEL);
if (read_name == NULL)
goto out;
snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++);
- r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, &fattr,
- NULL);
+ r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, fattr);
if (IS_ERR(r_ino)) {
res = ERR_CAST(r_ino);
goto out_free_name;
}
- filep = alloc_file_pseudo(r_ino, ss_mnt, read_name, FMODE_READ,
+ filep = alloc_file_pseudo(r_ino, ss_mnt, read_name, O_RDONLY,
r_ino->i_fop);
if (IS_ERR(filep)) {
res = ERR_CAST(filep);
goto out_free_name;
}
- filep->f_mode |= FMODE_READ;
ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode,
filep);
@@ -388,6 +389,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
out_free_name:
kfree(read_name);
out:
+ nfs_free_fattr(fattr);
return res;
out_stateowner:
nfs4_put_state_owner(sp);
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 8d8aba305ecc..f331866dd418 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -487,7 +487,7 @@ nfs_idmap_new(struct nfs_client *clp)
err_destroy_pipe:
rpc_destroy_pipe_data(idmap->idmap_pipe);
err:
- get_user_ns(idmap->user_ns);
+ put_user_ns(idmap->user_ns);
kfree(idmap);
return error;
}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 873342308dc0..3680c8da510c 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -164,16 +164,21 @@ static int nfs4_validate_fspath(struct dentry *dentry,
return 0;
}
-static size_t nfs_parse_server_name(char *string, size_t len,
- struct sockaddr *sa, size_t salen, struct net *net)
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+ size_t salen, struct net *net, int port)
{
ssize_t ret;
ret = rpc_pton(net, string, len, sa, salen);
if (ret == 0) {
- ret = nfs_dns_resolve_name(net, string, len, sa, salen);
- if (ret < 0)
- ret = 0;
+ ret = rpc_uaddr2sockaddr(net, string, len, sa, salen);
+ if (ret == 0) {
+ ret = nfs_dns_resolve_name(net, string, len, sa, salen);
+ if (ret < 0)
+ ret = 0;
+ }
+ } else if (port) {
+ rpc_set_port(sa, port);
}
return ret;
}
@@ -328,7 +333,7 @@ static int try_location(struct fs_context *fc,
nfs_parse_server_name(buf->data, buf->len,
&ctx->nfs_server.address,
sizeof(ctx->nfs_server._address),
- fc->net_ns);
+ fc->net_ns, 0);
if (ctx->nfs_server.addrlen == 0)
continue;
@@ -496,7 +501,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
continue;
salen = nfs_parse_server_name(buf->data, buf->len,
- sap, addr_bufsize, net);
+ sap, addr_bufsize, net, 0);
if (salen == 0)
continue;
rpc_set_port(sap, NFS_PORT);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e1214bb6b7ee..a79f66432bd3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -93,11 +93,11 @@ struct nfs4_opendata;
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct inode *inode);
static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs_open_context *ctx, struct nfs4_label *ilabel,
- struct nfs4_label *olabel);
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel);
#ifdef CONFIG_NFS_V4_1
static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
const struct cred *cred,
@@ -108,10 +108,6 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
const struct cred *, bool);
#endif
-static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ],
- const __u32 *src, struct inode *inode,
- struct nfs_server *server,
- struct nfs4_label *label);
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
static inline struct nfs4_label *
@@ -127,7 +123,8 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
return NULL;
err = security_dentry_init_security(dentry, sattr->ia_mode,
- &dentry->d_name, (void **)&label->label, &label->len);
+ &dentry->d_name, NULL,
+ (void **)&label->label, &label->len);
if (err == 0)
return label;
@@ -366,6 +363,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start);
}
+static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version)
+{
+ if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) {
+ fattr->pre_change_attr = version;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+ }
+}
+
static void nfs4_test_and_free_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
const struct cred *cred)
@@ -1232,8 +1237,7 @@ nfs4_update_changeattr_locked(struct inode *inode,
NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER |
NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
- NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR |
- NFS_INO_REVAL_PAGECACHE;
+ NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR;
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
}
nfsi->attrtimeo_timestamp = jiffies;
@@ -1329,7 +1333,6 @@ nfs4_map_atomic_open_claim(struct nfs_server *server,
static void nfs4_init_opendata_res(struct nfs4_opendata *p)
{
p->o_res.f_attr = &p->f_attr;
- p->o_res.f_label = p->f_label;
p->o_res.seqid = p->o_arg.seqid;
p->c_res.seqid = p->c_arg.seqid;
p->o_res.server = p->o_arg.server;
@@ -1355,8 +1358,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
if (p == NULL)
goto err;
- p->f_label = nfs4_label_alloc(server, gfp_mask);
- if (IS_ERR(p->f_label))
+ p->f_attr.label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->f_attr.label))
goto err_free_p;
p->a_label = nfs4_label_alloc(server, gfp_mask);
@@ -1388,27 +1391,17 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
sizeof(p->o_arg.u.verifier.data));
}
}
- /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
- * will return permission denied for all bits until close */
- if (!(flags & O_EXCL)) {
- /* ask server to check for all possible rights as results
- * are cached */
- switch (p->o_arg.claim) {
- default:
- break;
- case NFS4_OPEN_CLAIM_NULL:
- case NFS4_OPEN_CLAIM_FH:
- p->o_arg.access = NFS4_ACCESS_READ |
- NFS4_ACCESS_MODIFY |
- NFS4_ACCESS_EXTEND |
- NFS4_ACCESS_EXECUTE;
-#ifdef CONFIG_NFS_V4_2
- if (server->caps & NFS_CAP_XATTR)
- p->o_arg.access |= NFS4_ACCESS_XAREAD |
- NFS4_ACCESS_XAWRITE |
- NFS4_ACCESS_XALIST;
-#endif
- }
+ /* ask server to check for all possible rights as results
+ * are cached */
+ switch (p->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
+ NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE |
+ NFS4_ACCESS_EXECUTE |
+ nfs_access_xattr_mask(server);
}
p->o_arg.clientid = server->nfs_client->cl_clientid;
p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
@@ -1439,7 +1432,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
err_free_label:
nfs4_label_free(p->a_label);
err_free_f:
- nfs4_label_free(p->f_label);
+ nfs4_label_free(p->f_attr.label);
err_free_p:
kfree(p);
err:
@@ -1461,7 +1454,7 @@ static void nfs4_opendata_free(struct kref *kref)
nfs4_put_state_owner(p->owner);
nfs4_label_free(p->a_label);
- nfs4_label_free(p->f_label);
+ nfs4_label_free(p->f_attr.label);
dput(p->dir);
dput(p->dentry);
@@ -1609,15 +1602,16 @@ static bool nfs_stateid_is_sequential(struct nfs4_state *state,
{
if (test_bit(NFS_OPEN_STATE, &state->flags)) {
/* The common case - we're updating to a new sequence number */
- if (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
- nfs4_stateid_is_next(&state->open_stateid, stateid)) {
- return true;
+ if (nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ if (nfs4_stateid_is_next(&state->open_stateid, stateid))
+ return true;
+ return false;
}
- } else {
- /* This is the first OPEN in this generation */
- if (stateid->seqid == cpu_to_be32(1))
- return true;
+ /* The server returned a new stateid */
}
+ /* This is the first OPEN in this generation */
+ if (stateid->seqid == cpu_to_be32(1))
+ return true;
return false;
}
@@ -2013,7 +2007,7 @@ nfs4_opendata_get_inode(struct nfs4_opendata *data)
if (!(data->f_attr.valid & NFS_ATTR_FATTR))
return ERR_PTR(-EAGAIN);
inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh,
- &data->f_attr, data->f_label);
+ &data->f_attr);
break;
default:
inode = d_inode(data->dentry);
@@ -2472,11 +2466,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
/* Set the create mode (note dependency on the session type) */
data->o_arg.createmode = NFS4_CREATE_UNCHECKED;
if (data->o_arg.open_flags & O_EXCL) {
- data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE;
- if (nfs4_has_persistent_session(clp))
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1;
+ if (clp->cl_mvops->minor_version == 0) {
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE;
+ /* don't put an ACCESS op in OPEN compound if O_EXCL,
+ * because ACCESS will return permission denied for
+ * all bits until close */
+ data->o_res.access_request = data->o_arg.access = 0;
+ } else if (nfs4_has_persistent_session(clp))
data->o_arg.createmode = NFS4_CREATE_GUARDED;
- else if (clp->cl_mvops->minor_version > 0)
- data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1;
}
return;
unlock_no_action:
@@ -2653,9 +2651,8 @@ static int nfs4_opendata_access(const struct cred *cred,
} else if ((fmode & FMODE_READ) && !opendata->file_created)
mask = NFS4_ACCESS_READ;
- cache.cred = cred;
nfs_access_set_mask(&cache, opendata->o_res.access_result);
- nfs_access_add_cache(state->inode, &cache);
+ nfs_access_add_cache(state->inode, &cache, cred);
flags = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE | NFS4_ACCESS_LOOKUP;
if ((mask & ~cache.mask & flags) == 0)
@@ -2708,8 +2705,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data,
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
nfs4_sequence_free_slot(&o_res->seq_res);
- nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr,
- o_res->f_label, NULL);
+ nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL);
}
return 0;
}
@@ -3057,6 +3053,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
+ if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED)
+ set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags);
dentry = opendata->dentry;
if (d_really_is_negative(dentry)) {
@@ -3125,7 +3123,6 @@ static int _nfs4_do_open(struct inode *dir,
enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
struct iattr *sattr = c->sattr;
struct nfs4_label *label = c->label;
- struct nfs4_label *olabel = NULL;
int status;
/* Protect against reboot recovery conflicts */
@@ -3148,19 +3145,11 @@ static int _nfs4_do_open(struct inode *dir,
if (opendata == NULL)
goto err_put_state_owner;
- if (label) {
- olabel = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(olabel)) {
- status = PTR_ERR(olabel);
- goto err_opendata_put;
- }
- }
-
if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
if (!opendata->f_attr.mdsthreshold) {
opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
if (!opendata->f_attr.mdsthreshold)
- goto err_free_label;
+ goto err_opendata_put;
}
opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
}
@@ -3169,7 +3158,7 @@ static int _nfs4_do_open(struct inode *dir,
status = _nfs4_open_and_get_state(opendata, flags, ctx);
if (status != 0)
- goto err_free_label;
+ goto err_opendata_put;
state = ctx->state;
if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
@@ -3186,11 +3175,11 @@ static int _nfs4_do_open(struct inode *dir,
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
opendata->o_res.f_attr, sattr,
- ctx, label, olabel);
+ ctx, label);
if (status == 0) {
nfs_setattr_update_inode(state->inode, sattr,
opendata->o_res.f_attr);
- nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ nfs_setsecurity(state->inode, opendata->o_res.f_attr);
}
sattr->ia_valid = ia_old;
}
@@ -3203,13 +3192,9 @@ static int _nfs4_do_open(struct inode *dir,
opendata->f_attr.mdsthreshold = NULL;
}
- nfs4_label_free(olabel);
-
nfs4_opendata_put(opendata);
nfs4_put_state_owner(sp);
return 0;
-err_free_label:
- nfs4_label_free(olabel);
err_opendata_put:
nfs4_opendata_put(opendata);
err_put_state_owner:
@@ -3354,8 +3339,7 @@ zero_stateid:
static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs_open_context *ctx, struct nfs4_label *ilabel,
- struct nfs4_label *olabel)
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel)
{
struct nfs_server *server = NFS_SERVER(inode);
__u32 bitmask[NFS4_BITMASK_SZ];
@@ -3369,7 +3353,6 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
};
struct nfs_setattrres res = {
.fattr = fattr,
- .label = olabel,
.server = server,
};
struct nfs4_exception exception = {
@@ -3386,7 +3369,7 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
adjust_flags |= NFS_INO_INVALID_OTHER;
do {
- nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, olabel),
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label),
inode, adjust_flags);
err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx);
@@ -3561,7 +3544,6 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
.stateid = &calldata->arg.stateid,
};
- dprintk("%s: begin!\n", __func__);
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
return;
trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
@@ -3616,7 +3598,7 @@ out_release:
task->tk_status = 0;
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, &calldata->fattr);
- dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
+ dprintk("%s: ret = %d\n", __func__, task->tk_status);
return;
out_restart:
task->tk_status = 0;
@@ -3634,7 +3616,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
bool is_rdonly, is_wronly, is_rdwr;
int call_close = 0;
- dprintk("%s: begin!\n", __func__);
if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
goto out_wait;
@@ -3688,7 +3669,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
if (!nfs4_have_delegation(inode, FMODE_READ)) {
nfs4_bitmask_set(calldata->arg.bitmask_store,
server->cache_consistency_bitmask,
- inode, server, NULL);
+ inode, 0);
calldata->arg.bitmask = calldata->arg.bitmask_store;
} else
calldata->arg.bitmask = NULL;
@@ -3708,7 +3689,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
&calldata->res.seq_res,
task) != 0)
nfs_release_seqid(calldata->arg.seqid);
- dprintk("%s: done!\n", __func__);
return;
out_no_action:
task->tk_action = NULL;
@@ -3860,7 +3840,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
FATTR4_WORD0_FH_EXPIRE_TYPE |
FATTR4_WORD0_LINK_SUPPORT |
FATTR4_WORD0_SYMLINK_SUPPORT |
- FATTR4_WORD0_ACLSUPPORT;
+ FATTR4_WORD0_ACLSUPPORT |
+ FATTR4_WORD0_CASE_INSENSITIVE |
+ FATTR4_WORD0_CASE_PRESERVING;
if (minorversion)
bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
@@ -3889,10 +3871,16 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->caps |= NFS_CAP_HARDLINKS;
if (res.has_symlinks != 0)
server->caps |= NFS_CAP_SYMLINKS;
+ if (res.case_insensitive)
+ server->caps |= NFS_CAP_CASE_INSENSITIVE;
+ if (res.case_preserving)
+ server->caps |= NFS_CAP_CASE_PRESERVING;
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
server->caps |= NFS_CAP_SECURITY_LABEL;
#endif
+ if (res.attr_bitmask[0] & FATTR4_WORD0_FS_LOCATIONS)
+ server->caps |= NFS_CAP_FS_LOCATIONS;
if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID))
server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID;
if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE))
@@ -3941,6 +3929,8 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
.interruptible = true,
};
int err;
+
+ nfs4_server_set_init_caps(server);
do {
err = nfs4_handle_exception(server,
_nfs4_server_capabilities(server, fhandle),
@@ -3949,6 +3939,114 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
return err;
}
+static void test_fs_location_for_trunking(struct nfs4_fs_location *location,
+ struct nfs_client *clp,
+ struct nfs_server *server)
+{
+ int i;
+
+ for (i = 0; i < location->nservers; i++) {
+ struct nfs4_string *srv_loc = &location->servers[i];
+ struct sockaddr addr;
+ size_t addrlen;
+ struct xprt_create xprt_args = {
+ .ident = 0,
+ .net = clp->cl_net,
+ };
+ struct nfs4_add_xprt_data xprtdata = {
+ .clp = clp,
+ };
+ struct rpc_add_xprt_test rpcdata = {
+ .add_xprt_test = clp->cl_mvops->session_trunk,
+ .data = &xprtdata,
+ };
+ char *servername = NULL;
+
+ if (!srv_loc->len)
+ continue;
+
+ addrlen = nfs_parse_server_name(srv_loc->data, srv_loc->len,
+ &addr, sizeof(addr),
+ clp->cl_net, server->port);
+ if (!addrlen)
+ return;
+ xprt_args.dstaddr = &addr;
+ xprt_args.addrlen = addrlen;
+ servername = kmalloc(srv_loc->len + 1, GFP_KERNEL);
+ if (!servername)
+ return;
+ memcpy(servername, srv_loc->data, srv_loc->len);
+ servername[srv_loc->len] = '\0';
+ xprt_args.servername = servername;
+
+ xprtdata.cred = nfs4_get_clid_cred(clp);
+ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+ rpc_clnt_setup_test_and_add_xprt,
+ &rpcdata);
+ if (xprtdata.cred)
+ put_cred(xprtdata.cred);
+ kfree(servername);
+ }
+}
+
+static int _nfs4_discover_trunking(struct nfs_server *server,
+ struct nfs_fh *fhandle)
+{
+ struct nfs4_fs_locations *locations = NULL;
+ struct page *page;
+ const struct cred *cred;
+ struct nfs_client *clp = server->nfs_client;
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ int status = -ENOMEM, i;
+
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL) {
+ cred = nfs4_get_clid_cred(clp);
+ if (cred == NULL)
+ return -ENOKEY;
+ }
+
+ page = alloc_page(GFP_KERNEL);
+ locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+ if (page == NULL || locations == NULL)
+ goto out;
+
+ status = nfs4_proc_get_locations(server, fhandle, locations, page,
+ cred);
+ if (status)
+ goto out;
+
+ for (i = 0; i < locations->nlocations; i++)
+ test_fs_location_for_trunking(&locations->locations[i], clp,
+ server);
+out:
+ if (page)
+ __free_page(page);
+ kfree(locations);
+ return status;
+}
+
+static int nfs4_discover_trunking(struct nfs_server *server,
+ struct nfs_fh *fhandle)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct nfs_client *clp = server->nfs_client;
+ int err = 0;
+
+ if (!nfs4_has_session(clp))
+ goto out;
+ do {
+ err = nfs4_handle_exception(server,
+ _nfs4_discover_trunking(server, fhandle),
+ &exception);
+ } while (exception.retry);
+out:
+ return err;
+}
+
static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info)
{
@@ -4104,7 +4202,6 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
{
int error;
struct nfs_fattr *fattr = info->fattr;
- struct nfs4_label *label = fattr->label;
error = nfs4_server_capabilities(server, mntfh);
if (error < 0) {
@@ -4112,7 +4209,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
return error;
}
- error = nfs4_proc_getattr(server, mntfh, fattr, label, NULL);
+ error = nfs4_proc_getattr(server, mntfh, fattr, NULL);
if (error < 0) {
dprintk("nfs4_get_root: getattr error = %d\n", -error);
goto out;
@@ -4175,8 +4272,7 @@ out:
}
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode)
+ struct nfs_fattr *fattr, struct inode *inode)
{
__u32 bitmask[NFS4_BITMASK_SZ];
struct nfs4_getattr_arg args = {
@@ -4185,7 +4281,6 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
};
struct nfs4_getattr_res res = {
.fattr = fattr,
- .label = label,
.server = server,
};
struct rpc_message msg = {
@@ -4202,7 +4297,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
task_flags |= RPC_TASK_TIMEOUT;
- nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode, 0);
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), inode, 0);
nfs_fattr_init(fattr);
nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
return nfs4_do_call_sync(server->client, server, &msg,
@@ -4210,15 +4305,14 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
}
int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode)
+ struct nfs_fattr *fattr, struct inode *inode)
{
struct nfs4_exception exception = {
.interruptible = true,
};
int err;
do {
- err = _nfs4_proc_getattr(server, fhandle, fattr, label, inode);
+ err = _nfs4_proc_getattr(server, fhandle, fattr, inode);
trace_nfs4_getattr(server, fhandle, fattr, err);
err = nfs4_handle_exception(server, err,
&exception);
@@ -4250,7 +4344,6 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct inode *inode = d_inode(dentry);
const struct cred *cred = NULL;
struct nfs_open_context *ctx = NULL;
- struct nfs4_label *label = NULL;
int status;
if (pnfs_ld_layoutret_on_setattr(inode) &&
@@ -4276,26 +4369,21 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
cred = ctx->cred;
}
- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
- if (IS_ERR(label))
- return PTR_ERR(label);
-
/* Return any delegations if we're going to change ACLs */
if ((sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
nfs4_inode_make_writeable(inode);
- status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label);
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL);
if (status == 0) {
nfs_setattr_update_inode(inode, sattr, fattr);
- nfs_setsecurity(inode, fattr, label);
+ nfs_setsecurity(inode, fattr);
}
- nfs4_label_free(label);
return status;
}
static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct nfs_server *server = NFS_SERVER(dir);
int status;
@@ -4307,7 +4395,6 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
struct nfs4_lookup_res res = {
.server = server,
.fattr = fattr,
- .label = label,
.fh = fhandle,
};
struct rpc_message msg = {
@@ -4324,7 +4411,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
if (nfs_lookup_is_soft_revalidate(dentry))
task_flags |= RPC_TASK_TIMEOUT;
- args.bitmask = nfs4_bitmask(server, label);
+ args.bitmask = nfs4_bitmask(server, fattr->label);
nfs_fattr_init(fattr);
@@ -4346,7 +4433,7 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct nfs4_exception exception = {
.interruptible = true,
@@ -4355,7 +4442,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
const struct qstr *name = &dentry->d_name;
int err;
do {
- err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr, label);
+ err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr);
trace_nfs4_lookup(dir, name, err);
switch (err) {
case -NFS4ERR_BADNAME:
@@ -4391,13 +4478,12 @@ out:
}
static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
int status;
struct rpc_clnt *client = NFS_CLIENT(dir);
- status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr, label);
+ status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr);
if (client != NFS_CLIENT(dir)) {
rpc_shutdown_client(client);
nfs_fixup_secinfo_attributes(fattr);
@@ -4412,15 +4498,14 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry,
struct rpc_clnt *client = NFS_CLIENT(dir);
int status;
- status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr, NULL);
+ status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr);
if (status < 0)
return ERR_PTR(status);
return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;
}
static int _nfs4_proc_lookupp(struct inode *inode,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
struct rpc_clnt *clnt = NFS_CLIENT(inode);
struct nfs_server *server = NFS_SERVER(inode);
@@ -4432,7 +4517,6 @@ static int _nfs4_proc_lookupp(struct inode *inode,
struct nfs4_lookupp_res res = {
.server = server,
.fattr = fattr,
- .label = label,
.fh = fhandle,
};
struct rpc_message msg = {
@@ -4445,7 +4529,7 @@ static int _nfs4_proc_lookupp(struct inode *inode,
if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
task_flags |= RPC_TASK_TIMEOUT;
- args.bitmask = nfs4_bitmask(server, label);
+ args.bitmask = nfs4_bitmask(server, fattr->label);
nfs_fattr_init(fattr);
@@ -4457,14 +4541,14 @@ static int _nfs4_proc_lookupp(struct inode *inode,
}
static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct nfs4_exception exception = {
.interruptible = true,
};
int err;
do {
- err = _nfs4_proc_lookupp(inode, fhandle, fattr, label);
+ err = _nfs4_proc_lookupp(inode, fhandle, fattr);
trace_nfs4_lookupp(inode, err);
err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
@@ -4472,7 +4556,8 @@ static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
return err;
}
-static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry,
+ const struct cred *cred)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_accessargs args = {
@@ -4486,7 +4571,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
.rpc_argp = &args,
.rpc_resp = &res,
- .rpc_cred = entry->cred,
+ .rpc_cred = cred,
};
int status = 0;
@@ -4506,14 +4591,15 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
return status;
}
-static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry,
+ const struct cred *cred)
{
struct nfs4_exception exception = {
.interruptible = true,
};
int err;
do {
- err = _nfs4_proc_access(inode, entry);
+ err = _nfs4_proc_access(inode, entry, cred);
trace_nfs4_access(inode, err);
err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
@@ -4694,8 +4780,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg,
nfs_fattr_init(res->dir_attr);
- if (inode)
+ if (inode) {
nfs4_inode_return_delegation(inode);
+ nfs_d_prune_case_insensitive_aliases(inode);
+ }
}
static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -4761,6 +4849,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
return 0;
if (task->tk_status == 0) {
+ nfs_d_prune_case_insensitive_aliases(d_inode(data->old_dentry));
if (new_dir != old_dir) {
/* Note: If we moved a directory, nlink will change */
nfs4_update_changeattr(old_dir, &res->old_cinfo,
@@ -4791,7 +4880,6 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
};
struct nfs4_link_res res = {
.server = server,
- .label = NULL,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -4800,18 +4888,12 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
};
int status = -ENOMEM;
- res.fattr = nfs_alloc_fattr();
+ res.fattr = nfs_alloc_fattr_with_label(server);
if (res.fattr == NULL)
goto out;
- res.label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(res.label)) {
- status = PTR_ERR(res.label);
- goto out;
- }
-
nfs4_inode_make_writeable(inode);
- nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.label), inode,
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.fattr->label), inode,
NFS_INO_INVALID_CHANGE);
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
@@ -4820,12 +4902,9 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
nfs4_inc_nlink(inode);
status = nfs_post_op_update_inode(inode, res.fattr);
if (!status)
- nfs_setsecurity(inode, res.fattr, res.label);
+ nfs_setsecurity(inode, res.fattr);
}
-
- nfs4_label_free(res.label);
-
out:
nfs_free_fattr(res.fattr);
return status;
@@ -4851,7 +4930,6 @@ struct nfs4_createdata {
struct nfs4_create_res res;
struct nfs_fh fh;
struct nfs_fattr fattr;
- struct nfs4_label *label;
};
static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -4863,8 +4941,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
if (data != NULL) {
struct nfs_server *server = NFS_SERVER(dir);
- data->label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(data->label))
+ data->fattr.label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(data->fattr.label))
goto out_free;
data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
@@ -4875,12 +4953,11 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
data->arg.name = name;
data->arg.attrs = sattr;
data->arg.ftype = ftype;
- data->arg.bitmask = nfs4_bitmask(server, data->label);
+ data->arg.bitmask = nfs4_bitmask(server, data->fattr.label);
data->arg.umask = current_umask();
data->res.server = server;
data->res.fh = &data->fh;
data->res.fattr = &data->fattr;
- data->res.label = data->label;
nfs_fattr_init(data->res.fattr);
}
return data;
@@ -4902,14 +4979,14 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
data->res.fattr->time_start,
NFS_INO_INVALID_DATA);
spin_unlock(&dir->i_lock);
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
}
return status;
}
static void nfs4_free_createdata(struct nfs4_createdata *data)
{
- nfs4_label_free(data->label);
+ nfs4_label_free(data->fattr.label);
kfree(data);
}
@@ -5347,8 +5424,6 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task,
static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- dprintk("--> %s\n", __func__);
-
if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
if (nfs4_read_stateid_changed(task, &hdr->args))
@@ -5467,14 +5542,14 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
}
-static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src,
- struct inode *inode, struct nfs_server *server,
- struct nfs4_label *label)
+void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[],
+ struct inode *inode, unsigned long cache_validity)
{
- unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+ struct nfs_server *server = NFS_SERVER(inode);
unsigned int i;
memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ);
+ cache_validity |= READ_ONCE(NFS_I(inode)->cache_validity);
if (cache_validity & NFS_INO_INVALID_CHANGE)
bitmask[0] |= FATTR4_WORD0_CHANGE;
@@ -5486,8 +5561,6 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src,
bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP;
if (cache_validity & NFS_INO_INVALID_NLINK)
bitmask[1] |= FATTR4_WORD1_NUMLINKS;
- if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL)
- bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
if (cache_validity & NFS_INO_INVALID_CTIME)
bitmask[1] |= FATTR4_WORD1_TIME_METADATA;
if (cache_validity & NFS_INO_INVALID_MTIME)
@@ -5514,7 +5587,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
} else {
nfs4_bitmask_set(hdr->args.bitmask_store,
server->cache_consistency_bitmask,
- hdr->inode, server, NULL);
+ hdr->inode, NFS_INO_INVALID_BLOCKS);
hdr->args.bitmask = hdr->args.bitmask_store;
}
@@ -5836,7 +5909,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
buflen = server->rsize;
npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1;
- pages = kmalloc_array(npages, sizeof(struct page *), GFP_NOFS);
+ pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return -ENOMEM;
@@ -6004,17 +6077,18 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
size_t buflen)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_fattr fattr;
struct nfs4_label label = {0, 0, buflen, buf};
u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+ struct nfs_fattr fattr = {
+ .label = &label,
+ };
struct nfs4_getattr_arg arg = {
.fh = NFS_FH(inode),
.bitmask = bitmask,
};
struct nfs4_getattr_res res = {
.fattr = &fattr,
- .label = &label,
.server = server,
};
struct rpc_message msg = {
@@ -6056,8 +6130,7 @@ static int nfs4_get_security_label(struct inode *inode, void *buf,
static int _nfs4_do_set_security_label(struct inode *inode,
struct nfs4_label *ilabel,
- struct nfs_fattr *fattr,
- struct nfs4_label *olabel)
+ struct nfs_fattr *fattr)
{
struct iattr sattr = {0};
@@ -6072,7 +6145,6 @@ static int _nfs4_do_set_security_label(struct inode *inode,
};
struct nfs_setattrres res = {
.fattr = fattr,
- .label = olabel,
.server = server,
};
struct rpc_message msg = {
@@ -6093,15 +6165,13 @@ static int _nfs4_do_set_security_label(struct inode *inode,
static int nfs4_do_set_security_label(struct inode *inode,
struct nfs4_label *ilabel,
- struct nfs_fattr *fattr,
- struct nfs4_label *olabel)
+ struct nfs_fattr *fattr)
{
struct nfs4_exception exception = { };
int err;
do {
- err = _nfs4_do_set_security_label(inode, ilabel,
- fattr, olabel);
+ err = _nfs4_do_set_security_label(inode, ilabel, fattr);
trace_nfs4_set_security_label(inode, err);
err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
@@ -6112,32 +6182,21 @@ static int nfs4_do_set_security_label(struct inode *inode,
static int
nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
{
- struct nfs4_label ilabel, *olabel = NULL;
- struct nfs_fattr fattr;
+ struct nfs4_label ilabel = {0, 0, buflen, (char *)buf };
+ struct nfs_fattr *fattr;
int status;
if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
return -EOPNOTSUPP;
- nfs_fattr_init(&fattr);
-
- ilabel.pi = 0;
- ilabel.lfs = 0;
- ilabel.label = (char *)buf;
- ilabel.len = buflen;
-
- olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
- if (IS_ERR(olabel)) {
- status = -PTR_ERR(olabel);
- goto out;
- }
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
+ if (fattr == NULL)
+ return -ENOMEM;
- status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+ status = nfs4_do_set_security_label(inode, &ilabel, fattr);
if (status == 0)
- nfs_setsecurity(inode, &fattr, olabel);
+ nfs_setsecurity(inode, fattr);
- nfs4_label_free(olabel);
-out:
return status;
}
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
@@ -6502,7 +6561,9 @@ static void nfs4_delegreturn_release(void *calldata)
pnfs_roc_release(&data->lr.arg, &data->lr.res,
data->res.lr_ret);
if (inode) {
- nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+ nfs4_fattr_set_prechange(&data->fattr,
+ inode_peek_iversion_raw(inode));
+ nfs_refresh_inode(inode, &data->fattr);
nfs_iput_and_deactive(inode);
}
kfree(calldata);
@@ -6555,7 +6616,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
};
int status = 0;
- data = kzalloc(sizeof(*data), GFP_NOFS);
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
@@ -6566,8 +6627,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
nfs4_bitmask_set(data->args.bitmask_store,
- server->cache_consistency_bitmask, inode, server,
- NULL);
+ server->cache_consistency_bitmask, inode, 0);
data->args.bitmask = data->args.bitmask_store;
nfs_copy_fh(&data->fh, NFS_FH(inode));
nfs4_stateid_copy(&data->stateid, stateid);
@@ -6744,7 +6804,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
struct nfs4_state *state = lsp->ls_state;
struct inode *inode = state->inode;
- p = kzalloc(sizeof(*p), GFP_NOFS);
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return NULL;
p->arg.fh = NFS_FH(inode);
@@ -7003,7 +7063,6 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
struct nfs4_lockdata *data = calldata;
struct nfs4_state *state = data->lsp->ls_state;
- dprintk("%s: begin!\n", __func__);
if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
goto out_wait;
/* Do we need to do an open_to_lock_owner? */
@@ -7037,7 +7096,7 @@ out_release_lock_seqid:
nfs_release_seqid(data->arg.lock_seqid);
out_wait:
nfs4_sequence_done(task, &data->res.seq_res);
- dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
+ dprintk("%s: ret = %d\n", __func__, data->rpc_status);
}
static void nfs4_lock_done(struct rpc_task *task, void *calldata)
@@ -7045,8 +7104,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
struct nfs4_lockdata *data = calldata;
struct nfs4_lock_state *lsp = data->lsp;
- dprintk("%s: begin!\n", __func__);
-
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
@@ -7080,7 +7137,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
goto out_restart;
}
out_done:
- dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
+ dprintk("%s: ret = %d!\n", __func__, data->rpc_status);
return;
out_restart:
if (!data->cancelled)
@@ -7092,7 +7149,6 @@ static void nfs4_lock_release(void *calldata)
{
struct nfs4_lockdata *data = calldata;
- dprintk("%s: begin!\n", __func__);
nfs_free_seqid(data->arg.open_seqid);
if (data->cancelled && data->rpc_status == 0) {
struct rpc_task *task;
@@ -7106,7 +7162,6 @@ static void nfs4_lock_release(void *calldata)
nfs4_put_lock_state(data->lsp);
put_nfs_open_context(data->ctx);
kfree(data);
- dprintk("%s: done!\n", __func__);
}
static const struct rpc_call_ops nfs4_lock_ops = {
@@ -7153,10 +7208,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
if (client->cl_minorversion)
task_setup_data.flags |= RPC_TASK_MOVEABLE;
- dprintk("%s: begin!\n", __func__);
data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
- fl->fl_u.nfs4_fl.owner,
- recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
+ fl->fl_u.nfs4_fl.owner, GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
if (IS_SETLKW(cmd))
@@ -7184,7 +7237,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
data->cancelled = true;
trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
rpc_put_task(task);
- dprintk("%s: done, ret = %d!\n", __func__, ret);
+ dprintk("%s: ret = %d\n", __func__, ret);
return ret;
}
@@ -7579,7 +7632,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
if (server->nfs_client->cl_mvops->minor_version != 0)
return;
- data = kmalloc(sizeof(*data), GFP_NOFS);
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return;
data->lsp = lsp;
@@ -7676,7 +7729,7 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler,
const char *key, const void *buf,
size_t buflen, int flags)
{
- struct nfs_access_entry cache;
+ u32 mask;
int ret;
if (!nfs_server_capable(inode, NFS_CAP_XATTR))
@@ -7691,8 +7744,8 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler,
* do a cached access check for the XA* flags to possibly avoid
* doing an RPC and getting EACCES back.
*/
- if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) {
- if (!(cache.mask & NFS_ACCESS_XAWRITE))
+ if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) {
+ if (!(mask & NFS_ACCESS_XAWRITE))
return -EACCES;
}
@@ -7713,14 +7766,14 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *key, void *buf, size_t buflen)
{
- struct nfs_access_entry cache;
+ u32 mask;
ssize_t ret;
if (!nfs_server_capable(inode, NFS_CAP_XATTR))
return -EOPNOTSUPP;
- if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) {
- if (!(cache.mask & NFS_ACCESS_XAREAD))
+ if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) {
+ if (!(mask & NFS_ACCESS_XAREAD))
return -EACCES;
}
@@ -7745,13 +7798,13 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
ssize_t ret, size;
char *buf;
size_t buflen;
- struct nfs_access_entry cache;
+ u32 mask;
if (!nfs_server_capable(inode, NFS_CAP_XATTR))
return 0;
- if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) {
- if (!(cache.mask & NFS_ACCESS_XALIST))
+ if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) {
+ if (!(mask & NFS_ACCESS_XALIST))
return 0;
}
@@ -7883,18 +7936,18 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
* appended to this compound to identify the client ID which is
* performing recovery.
*/
-static int _nfs40_proc_get_locations(struct inode *inode,
+static int _nfs40_proc_get_locations(struct nfs_server *server,
+ struct nfs_fh *fhandle,
struct nfs4_fs_locations *locations,
struct page *page, const struct cred *cred)
{
- struct nfs_server *server = NFS_SERVER(inode);
struct rpc_clnt *clnt = server->client;
u32 bitmask[2] = {
[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
};
struct nfs4_fs_locations_arg args = {
.clientid = server->nfs_client->cl_clientid,
- .fh = NFS_FH(inode),
+ .fh = fhandle,
.page = page,
.bitmask = bitmask,
.migration = 1, /* skip LOOKUP */
@@ -7940,17 +7993,17 @@ static int _nfs40_proc_get_locations(struct inode *inode,
* When the client supports GETATTR(fs_locations_info), it can
* be plumbed in here.
*/
-static int _nfs41_proc_get_locations(struct inode *inode,
+static int _nfs41_proc_get_locations(struct nfs_server *server,
+ struct nfs_fh *fhandle,
struct nfs4_fs_locations *locations,
struct page *page, const struct cred *cred)
{
- struct nfs_server *server = NFS_SERVER(inode);
struct rpc_clnt *clnt = server->client;
u32 bitmask[2] = {
[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
};
struct nfs4_fs_locations_arg args = {
- .fh = NFS_FH(inode),
+ .fh = fhandle,
.page = page,
.bitmask = bitmask,
.migration = 1, /* skip LOOKUP */
@@ -7965,6 +8018,18 @@ static int _nfs41_proc_get_locations(struct inode *inode,
.rpc_resp = &res,
.rpc_cred = cred,
};
+ struct nfs4_call_sync_data data = {
+ .seq_server = server,
+ .seq_args = &args.seq_args,
+ .seq_res = &res.seq_res,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_message = &msg,
+ .callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
+ .callback_data = &data,
+ .flags = RPC_TASK_NO_ROUND_ROBIN,
+ };
int status;
nfs_fattr_init(&locations->fattr);
@@ -7972,8 +8037,7 @@ static int _nfs41_proc_get_locations(struct inode *inode,
locations->nlocations = 0;
nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
- status = nfs4_call_sync_sequence(clnt, server, &msg,
- &args.seq_args, &res.seq_res);
+ status = nfs4_call_sync_custom(&task_setup_data);
if (status == NFS4_OK &&
res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
status = -NFS4ERR_LEASE_MOVED;
@@ -7984,7 +8048,8 @@ static int _nfs41_proc_get_locations(struct inode *inode,
/**
* nfs4_proc_get_locations - discover locations for a migrated FSID
- * @inode: inode on FSID that is migrating
+ * @server: pointer to nfs_server to process
+ * @fhandle: pointer to the kernel NFS client file handle
* @locations: result of query
* @page: buffer
* @cred: credential to use for this operation
@@ -7999,11 +8064,11 @@ static int _nfs41_proc_get_locations(struct inode *inode,
* -NFS4ERR_LEASE_MOVED is returned if the server still has leases
* from this client that require migration recovery.
*/
-int nfs4_proc_get_locations(struct inode *inode,
+int nfs4_proc_get_locations(struct nfs_server *server,
+ struct nfs_fh *fhandle,
struct nfs4_fs_locations *locations,
struct page *page, const struct cred *cred)
{
- struct nfs_server *server = NFS_SERVER(inode);
struct nfs_client *clp = server->nfs_client;
const struct nfs4_mig_recovery_ops *ops =
clp->cl_mvops->mig_recovery_ops;
@@ -8016,10 +8081,11 @@ int nfs4_proc_get_locations(struct inode *inode,
(unsigned long long)server->fsid.major,
(unsigned long long)server->fsid.minor,
clp->cl_hostname);
- nfs_display_fhandle(NFS_FH(inode), __func__);
+ nfs_display_fhandle(fhandle, __func__);
do {
- status = ops->get_locations(inode, locations, page, cred);
+ status = ops->get_locations(server, fhandle, locations, page,
+ cred);
if (status != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, status, &exception);
@@ -8284,6 +8350,7 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_DEADSESSION:
nfs4_schedule_session_recovery(clp->cl_session,
task->tk_status);
+ return;
}
if (args->dir == NFS4_CDFC4_FORE_OR_BOTH &&
res->dir != NFS4_CDFS4_BOTH) {
@@ -8855,14 +8922,12 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task,
struct nfs4_get_lease_time_data *data =
(struct nfs4_get_lease_time_data *)calldata;
- dprintk("--> %s\n", __func__);
/* just setup sequence, do not trigger session recovery
since we're invoked within one */
nfs4_setup_sequence(data->clp,
&data->args->la_seq_args,
&data->res->lr_seq_res,
task);
- dprintk("<-- %s\n", __func__);
}
/*
@@ -8874,13 +8939,11 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
struct nfs4_get_lease_time_data *data =
(struct nfs4_get_lease_time_data *)calldata;
- dprintk("--> %s\n", __func__);
if (!nfs4_sequence_done(task, &data->res->lr_seq_res))
return;
switch (task->tk_status) {
case -NFS4ERR_DELAY:
case -NFS4ERR_GRACE:
- dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
rpc_delay(task, NFS4_POLL_RETRY_MIN);
task->tk_status = 0;
fallthrough;
@@ -8888,7 +8951,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
rpc_restart_call_prepare(task);
return;
}
- dprintk("<-- %s\n", __func__);
}
static const struct rpc_call_ops nfs4_get_lease_time_ops = {
@@ -9120,7 +9182,6 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
out:
- dprintk("<-- %s\n", __func__);
return status;
}
@@ -9138,8 +9199,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
};
int status = 0;
- dprintk("--> nfs4_proc_destroy_session\n");
-
/* session is still being setup */
if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
return 0;
@@ -9151,8 +9210,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
if (status)
dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "
"Session has been destroyed regardless...\n", status);
-
- dprintk("<-- nfs4_proc_destroy_session\n");
return status;
}
@@ -9200,7 +9257,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
if (task->tk_status < 0) {
dprintk("%s ERROR %d\n", __func__, task->tk_status);
if (refcount_read(&clp->cl_count) == 1)
- goto out;
+ return;
if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
rpc_restart_call_prepare(task);
@@ -9208,8 +9265,6 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
}
}
dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
-out:
- dprintk("<-- %s\n", __func__);
}
static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
@@ -9254,7 +9309,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
goto out_err;
ret = ERR_PTR(-ENOMEM);
- calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+ calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
if (calldata == NULL)
goto out_put_clp;
nfs4_init_sequence(&calldata->args, &calldata->res, 0, is_privileged);
@@ -9356,7 +9411,6 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
struct nfs_client *clp = calldata->clp;
struct nfs4_sequence_res *res = &calldata->res.seq_res;
- dprintk("--> %s\n", __func__);
if (!nfs41_sequence_done(task, res))
return;
@@ -9365,7 +9419,6 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
rpc_restart_call_prepare(task);
return;
}
- dprintk("<-- %s\n", __func__);
}
static void nfs4_free_reclaim_complete_data(void *data)
@@ -9400,7 +9453,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
};
int status = -ENOMEM;
- dprintk("--> %s\n", __func__);
calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
if (calldata == NULL)
goto out;
@@ -9423,19 +9475,15 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct nfs4_layoutget *lgp = calldata;
struct nfs_server *server = NFS_SERVER(lgp->args.inode);
- dprintk("--> %s\n", __func__);
nfs4_setup_sequence(server->nfs_client, &lgp->args.seq_args,
&lgp->res.seq_res, task);
- dprintk("<-- %s\n", __func__);
}
static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
- dprintk("--> %s\n", __func__);
nfs41_sequence_process(task, &lgp->res.seq_res);
- dprintk("<-- %s\n", __func__);
}
static int
@@ -9524,7 +9572,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
status = err;
}
out:
- dprintk("<-- %s\n", __func__);
return status;
}
@@ -9538,10 +9585,8 @@ static void nfs4_layoutget_release(void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
- dprintk("--> %s\n", __func__);
nfs4_sequence_free_slot(&lgp->res.seq_res);
pnfs_layoutget_free(lgp);
- dprintk("<-- %s\n", __func__);
}
static const struct rpc_call_ops nfs4_layoutget_call_ops = {
@@ -9577,11 +9622,11 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
};
int status = 0;
- dprintk("--> %s\n", __func__);
-
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return ERR_CAST(task);
status = rpc_wait_for_completion_task(task);
if (status != 0)
@@ -9614,7 +9659,6 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
- dprintk("--> %s\n", __func__);
nfs4_setup_sequence(lrp->clp,
&lrp->args.seq_args,
&lrp->res.seq_res,
@@ -9628,8 +9672,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
struct nfs4_layoutreturn *lrp = calldata;
struct nfs_server *server;
- dprintk("--> %s\n", __func__);
-
if (!nfs41_sequence_process(task, &lrp->res.seq_res))
return;
@@ -9660,7 +9702,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
break;
goto out_restart;
}
- dprintk("<-- %s\n", __func__);
return;
out_restart:
task->tk_status = 0;
@@ -9673,7 +9714,6 @@ static void nfs4_layoutreturn_release(void *calldata)
struct nfs4_layoutreturn *lrp = calldata;
struct pnfs_layout_hdr *lo = lrp->args.layout;
- dprintk("--> %s\n", __func__);
pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range,
lrp->res.lrs_present ? &lrp->res.stateid : NULL);
nfs4_sequence_free_slot(&lrp->res.seq_res);
@@ -9683,7 +9723,6 @@ static void nfs4_layoutreturn_release(void *calldata)
nfs_iput_and_deactive(lrp->inode);
put_cred(lrp->cred);
kfree(calldata);
- dprintk("<-- %s\n", __func__);
}
static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
@@ -9714,7 +9753,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
NFS_SP4_MACH_CRED_PNFS_CLEANUP,
&task_setup_data.rpc_client, &msg);
- dprintk("--> %s\n", __func__);
lrp->inode = nfs_igrab_and_active(lrp->args.inode);
if (!sync) {
if (!lrp->inode) {
@@ -9761,7 +9799,6 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
};
int status;
- dprintk("--> %s\n", __func__);
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (res.notification & ~args.notify_types)
dprintk("%s: unsupported notification\n", __func__);
@@ -9933,7 +9970,6 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
msg.rpc_cred = cred;
}
- dprintk("--> %s\n", __func__);
nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
status = nfs4_call_sync_custom(&task_setup);
dprintk("<-- %s status=%d\n", __func__, status);
@@ -10157,6 +10193,10 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
static void nfs41_free_stateid_release(void *calldata)
{
+ struct nfs_free_stateid_data *data = calldata;
+ struct nfs_client *clp = data->server->nfs_client;
+
+ nfs_put_client(clp);
kfree(calldata);
}
@@ -10193,12 +10233,16 @@ static int nfs41_free_stateid(struct nfs_server *server,
};
struct nfs_free_stateid_data *data;
struct rpc_task *task;
+ struct nfs_client *clp = server->nfs_client;
+
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ return -EIO;
nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
&task_setup.rpc_client, &msg);
dprintk("NFS call free_stateid %p\n", stateid);
- data = kmalloc(sizeof(*data), GFP_NOFS);
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
data->server = server;
@@ -10437,6 +10481,24 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
return error + error2 + error3;
}
+static void nfs4_enable_swap(struct inode *inode)
+{
+ /* The state manager thread must always be running.
+ * It will notice the client is a swapper, and stay put.
+ */
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+
+ nfs4_schedule_state_manager(clp);
+}
+
+static void nfs4_disable_swap(struct inode *inode)
+{
+ /* The state manager thread will now exit once it is
+ * woken.
+ */
+ wake_up_var(&NFS_SERVER(inode)->nfs_client->cl_state);
+}
+
static const struct inode_operations nfs4_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
@@ -10513,6 +10575,9 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.free_client = nfs4_free_client,
.create_server = nfs4_create_server,
.clone_server = nfs_clone_server,
+ .discover_trunking = nfs4_discover_trunking,
+ .enable_swap = nfs4_enable_swap,
+ .disable_swap = nfs4_disable_swap,
};
static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 4145a0138907..5db460476bf2 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -511,12 +511,16 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
struct nfs4_slot *slot,
struct nfs4_sequence_res *res)
{
+ u32 target_highest_slotid = min(res->sr_target_highest_slotid,
+ NFS4_MAX_SLOTID);
+ u32 highest_slotid = min(res->sr_highest_slotid, NFS4_MAX_SLOTID);
+
spin_lock(&tbl->slot_tbl_lock);
- if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid))
- nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid);
+ if (!nfs41_is_outlier_target_slotid(tbl, target_highest_slotid))
+ nfs41_set_target_slotid_locked(tbl, target_highest_slotid);
if (tbl->generation == slot->generation)
- nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid);
- nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid);
+ nfs41_set_server_slotid_locked(tbl, highest_slotid);
+ nfs41_set_max_slotid_locked(tbl, target_highest_slotid);
spin_unlock(&tbl->slot_tbl_lock);
}
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 3de425f59b3a..351616c61df5 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -12,6 +12,7 @@
#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U)
#define NFS4_MAX_SLOT_TABLE (1024U)
+#define NFS4_MAX_SLOTID (NFS4_MAX_SLOT_TABLE - 1U)
#define NFS4_NO_SLOT ((u32)-1)
#if IS_ENABLED(CONFIG_NFS_V4)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f22818a80c2c..9e1c987c81e7 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <linux/jiffies.h>
+#include <linux/sched/mm.h>
#include <linux/sunrpc/clnt.h>
@@ -666,7 +667,7 @@ nfs4_alloc_open_state(void)
{
struct nfs4_state *state;
- state = kzalloc(sizeof(*state), GFP_NOFS);
+ state = kzalloc(sizeof(*state), GFP_KERNEL_ACCOUNT);
if (!state)
return NULL;
refcount_set(&state->count, 1);
@@ -820,7 +821,7 @@ static void __nfs4_close(struct nfs4_state *state,
void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
{
- __nfs4_close(state, fmode, GFP_NOFS, 0);
+ __nfs4_close(state, fmode, GFP_KERNEL, 0);
}
void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
@@ -869,14 +870,15 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
struct nfs4_lock_state *lsp;
struct nfs_server *server = state->owner->so_server;
- lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
+ lsp = kzalloc(sizeof(*lsp), GFP_KERNEL_ACCOUNT);
if (lsp == NULL)
return NULL;
nfs4_init_seqid_counter(&lsp->ls_seqid);
refcount_set(&lsp->ls_count, 1);
lsp->ls_state = state;
lsp->ls_owner = fl_owner;
- lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
+ lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id,
+ 0, 0, GFP_KERNEL_ACCOUNT);
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
INIT_LIST_HEAD(&lsp->ls_locks);
@@ -1194,10 +1196,7 @@ static int nfs4_run_state_manager(void *);
static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
{
- smp_mb__before_atomic();
- clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
- smp_mb__after_atomic();
- wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
+ clear_and_wake_up_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
rpc_wake_up(&clp->cl_rpcwaitq);
}
@@ -1208,10 +1207,17 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
{
struct task_struct *task;
char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+ struct rpc_clnt *cl = clp->cl_rpcclient;
+
+ while (cl != cl->cl_parent)
+ cl = cl->cl_parent;
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
- if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+ if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) {
+ wake_up_var(&clp->cl_state);
return;
+ }
+ set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
__module_get(THIS_MODULE);
refcount_inc(&clp->cl_count);
@@ -1227,6 +1233,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
printk(KERN_ERR "%s: kthread_run: %ld\n",
__func__, PTR_ERR(task));
nfs4_clear_state_manager_bit(clp);
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs_put_client(clp);
module_put(THIS_MODULE);
}
@@ -2001,6 +2008,10 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
dprintk("%s: exit with error %d for server %s\n",
__func__, -EPROTONOSUPPORT, clp->cl_hostname);
return -EPROTONOSUPPORT;
+ case -ENOSPC:
+ if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
+ nfs_mark_client_ready(clp, -EIO);
+ return -EIO;
case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
* in nfs4_exchange_id */
default:
@@ -2097,7 +2108,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
}
inode = d_inode(server->super->s_root);
- result = nfs4_proc_get_locations(inode, locations, page, cred);
+ result = nfs4_proc_get_locations(server, NFS_FH(inode), locations,
+ page, cred);
if (result) {
dprintk("<-- %s: failed to retrieve fs_locations: %d\n",
__func__, result);
@@ -2105,6 +2117,9 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
}
result = -NFS4ERR_NXIO;
+ if (!locations->nlocations)
+ goto out;
+
if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
dprintk("<-- %s: No fs_locations data, migration skipped\n",
__func__);
@@ -2555,9 +2570,17 @@ static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
static void nfs4_state_manager(struct nfs_client *clp)
{
+ unsigned int memflags;
int status = 0;
const char *section = "", *section_sep = "";
+ /*
+ * State recovery can deadlock if the direct reclaim code tries
+ * to start NFS writeback. So ensure memory allocations are all
+ * GFP_NOFS.
+ */
+ memflags = memalloc_nofs_save();
+
/* Ensure exclusive access to NFSv4 state */
do {
trace_nfs4_state_mgr(clp);
@@ -2652,6 +2675,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
}
+ memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
@@ -2664,11 +2688,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state);
}
- /* Did we race with an attempt to give us more work? */
- if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
- return;
- if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
- return;
+ return;
+
} while (refcount_read(&clp->cl_count) > 1 && !signalled());
goto out_drain;
@@ -2681,6 +2702,7 @@ out_error:
clp->cl_hostname, -status);
ssleep(1);
out_drain:
+ memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
}
@@ -2688,10 +2710,31 @@ out_drain:
static int nfs4_run_state_manager(void *ptr)
{
struct nfs_client *clp = ptr;
+ struct rpc_clnt *cl = clp->cl_rpcclient;
+
+ while (cl != cl->cl_parent)
+ cl = cl->cl_parent;
allow_signal(SIGKILL);
+again:
+ set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
nfs4_state_manager(clp);
+ if (atomic_read(&cl->cl_swapper)) {
+ wait_var_event_interruptible(&clp->cl_state,
+ test_bit(NFS4CLNT_RUN_MANAGER,
+ &clp->cl_state));
+ if (atomic_read(&cl->cl_swapper) &&
+ test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
+ goto again;
+ /* Either no longer a swapper, or we were signalled */
+ }
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+
+ if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+ test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state))
+ goto again;
+
nfs_put_client(clp);
- module_put_and_exit(0);
return 0;
}
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 7a2567aa2b86..6ee6ad3674a2 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -9,322 +9,10 @@
#define _TRACE_NFS4_H
#include <linux/tracepoint.h>
+#include <trace/events/sunrpc_base.h>
-TRACE_DEFINE_ENUM(EPERM);
-TRACE_DEFINE_ENUM(ENOENT);
-TRACE_DEFINE_ENUM(EIO);
-TRACE_DEFINE_ENUM(ENXIO);
-TRACE_DEFINE_ENUM(EACCES);
-TRACE_DEFINE_ENUM(EEXIST);
-TRACE_DEFINE_ENUM(EXDEV);
-TRACE_DEFINE_ENUM(ENOTDIR);
-TRACE_DEFINE_ENUM(EISDIR);
-TRACE_DEFINE_ENUM(EFBIG);
-TRACE_DEFINE_ENUM(ENOSPC);
-TRACE_DEFINE_ENUM(EROFS);
-TRACE_DEFINE_ENUM(EMLINK);
-TRACE_DEFINE_ENUM(ENAMETOOLONG);
-TRACE_DEFINE_ENUM(ENOTEMPTY);
-TRACE_DEFINE_ENUM(EDQUOT);
-TRACE_DEFINE_ENUM(ESTALE);
-TRACE_DEFINE_ENUM(EBADHANDLE);
-TRACE_DEFINE_ENUM(EBADCOOKIE);
-TRACE_DEFINE_ENUM(ENOTSUPP);
-TRACE_DEFINE_ENUM(ETOOSMALL);
-TRACE_DEFINE_ENUM(EREMOTEIO);
-TRACE_DEFINE_ENUM(EBADTYPE);
-TRACE_DEFINE_ENUM(EAGAIN);
-TRACE_DEFINE_ENUM(ELOOP);
-TRACE_DEFINE_ENUM(EOPNOTSUPP);
-TRACE_DEFINE_ENUM(EDEADLK);
-TRACE_DEFINE_ENUM(ENOMEM);
-TRACE_DEFINE_ENUM(EKEYEXPIRED);
-TRACE_DEFINE_ENUM(ETIMEDOUT);
-TRACE_DEFINE_ENUM(ERESTARTSYS);
-TRACE_DEFINE_ENUM(ECONNREFUSED);
-TRACE_DEFINE_ENUM(ECONNRESET);
-TRACE_DEFINE_ENUM(ENETUNREACH);
-TRACE_DEFINE_ENUM(EHOSTUNREACH);
-TRACE_DEFINE_ENUM(EHOSTDOWN);
-TRACE_DEFINE_ENUM(EPIPE);
-TRACE_DEFINE_ENUM(EPFNOSUPPORT);
-TRACE_DEFINE_ENUM(EPROTONOSUPPORT);
-
-TRACE_DEFINE_ENUM(NFS4_OK);
-TRACE_DEFINE_ENUM(NFS4ERR_ACCESS);
-TRACE_DEFINE_ENUM(NFS4ERR_ATTRNOTSUPP);
-TRACE_DEFINE_ENUM(NFS4ERR_ADMIN_REVOKED);
-TRACE_DEFINE_ENUM(NFS4ERR_BACK_CHAN_BUSY);
-TRACE_DEFINE_ENUM(NFS4ERR_BADCHAR);
-TRACE_DEFINE_ENUM(NFS4ERR_BADHANDLE);
-TRACE_DEFINE_ENUM(NFS4ERR_BADIOMODE);
-TRACE_DEFINE_ENUM(NFS4ERR_BADLAYOUT);
-TRACE_DEFINE_ENUM(NFS4ERR_BADLABEL);
-TRACE_DEFINE_ENUM(NFS4ERR_BADNAME);
-TRACE_DEFINE_ENUM(NFS4ERR_BADOWNER);
-TRACE_DEFINE_ENUM(NFS4ERR_BADSESSION);
-TRACE_DEFINE_ENUM(NFS4ERR_BADSLOT);
-TRACE_DEFINE_ENUM(NFS4ERR_BADTYPE);
-TRACE_DEFINE_ENUM(NFS4ERR_BADXDR);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_COOKIE);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_HIGH_SLOT);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_RANGE);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_SEQID);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_SESSION_DIGEST);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_STATEID);
-TRACE_DEFINE_ENUM(NFS4ERR_CB_PATH_DOWN);
-TRACE_DEFINE_ENUM(NFS4ERR_CLID_INUSE);
-TRACE_DEFINE_ENUM(NFS4ERR_CLIENTID_BUSY);
-TRACE_DEFINE_ENUM(NFS4ERR_COMPLETE_ALREADY);
-TRACE_DEFINE_ENUM(NFS4ERR_CONN_NOT_BOUND_TO_SESSION);
-TRACE_DEFINE_ENUM(NFS4ERR_DEADLOCK);
-TRACE_DEFINE_ENUM(NFS4ERR_DEADSESSION);
-TRACE_DEFINE_ENUM(NFS4ERR_DELAY);
-TRACE_DEFINE_ENUM(NFS4ERR_DELEG_ALREADY_WANTED);
-TRACE_DEFINE_ENUM(NFS4ERR_DELEG_REVOKED);
-TRACE_DEFINE_ENUM(NFS4ERR_DENIED);
-TRACE_DEFINE_ENUM(NFS4ERR_DIRDELEG_UNAVAIL);
-TRACE_DEFINE_ENUM(NFS4ERR_DQUOT);
-TRACE_DEFINE_ENUM(NFS4ERR_ENCR_ALG_UNSUPP);
-TRACE_DEFINE_ENUM(NFS4ERR_EXIST);
-TRACE_DEFINE_ENUM(NFS4ERR_EXPIRED);
-TRACE_DEFINE_ENUM(NFS4ERR_FBIG);
-TRACE_DEFINE_ENUM(NFS4ERR_FHEXPIRED);
-TRACE_DEFINE_ENUM(NFS4ERR_FILE_OPEN);
-TRACE_DEFINE_ENUM(NFS4ERR_GRACE);
-TRACE_DEFINE_ENUM(NFS4ERR_HASH_ALG_UNSUPP);
-TRACE_DEFINE_ENUM(NFS4ERR_INVAL);
-TRACE_DEFINE_ENUM(NFS4ERR_IO);
-TRACE_DEFINE_ENUM(NFS4ERR_ISDIR);
-TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTTRYLATER);
-TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTUNAVAILABLE);
-TRACE_DEFINE_ENUM(NFS4ERR_LEASE_MOVED);
-TRACE_DEFINE_ENUM(NFS4ERR_LOCKED);
-TRACE_DEFINE_ENUM(NFS4ERR_LOCKS_HELD);
-TRACE_DEFINE_ENUM(NFS4ERR_LOCK_RANGE);
-TRACE_DEFINE_ENUM(NFS4ERR_MINOR_VERS_MISMATCH);
-TRACE_DEFINE_ENUM(NFS4ERR_MLINK);
-TRACE_DEFINE_ENUM(NFS4ERR_MOVED);
-TRACE_DEFINE_ENUM(NFS4ERR_NAMETOOLONG);
-TRACE_DEFINE_ENUM(NFS4ERR_NOENT);
-TRACE_DEFINE_ENUM(NFS4ERR_NOFILEHANDLE);
-TRACE_DEFINE_ENUM(NFS4ERR_NOMATCHING_LAYOUT);
-TRACE_DEFINE_ENUM(NFS4ERR_NOSPC);
-TRACE_DEFINE_ENUM(NFS4ERR_NOTDIR);
-TRACE_DEFINE_ENUM(NFS4ERR_NOTEMPTY);
-TRACE_DEFINE_ENUM(NFS4ERR_NOTSUPP);
-TRACE_DEFINE_ENUM(NFS4ERR_NOT_ONLY_OP);
-TRACE_DEFINE_ENUM(NFS4ERR_NOT_SAME);
-TRACE_DEFINE_ENUM(NFS4ERR_NO_GRACE);
-TRACE_DEFINE_ENUM(NFS4ERR_NXIO);
-TRACE_DEFINE_ENUM(NFS4ERR_OLD_STATEID);
-TRACE_DEFINE_ENUM(NFS4ERR_OPENMODE);
-TRACE_DEFINE_ENUM(NFS4ERR_OP_ILLEGAL);
-TRACE_DEFINE_ENUM(NFS4ERR_OP_NOT_IN_SESSION);
-TRACE_DEFINE_ENUM(NFS4ERR_PERM);
-TRACE_DEFINE_ENUM(NFS4ERR_PNFS_IO_HOLE);
-TRACE_DEFINE_ENUM(NFS4ERR_PNFS_NO_LAYOUT);
-TRACE_DEFINE_ENUM(NFS4ERR_RECALLCONFLICT);
-TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_BAD);
-TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_CONFLICT);
-TRACE_DEFINE_ENUM(NFS4ERR_REJECT_DELEG);
-TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG);
-TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG_TO_CACHE);
-TRACE_DEFINE_ENUM(NFS4ERR_REQ_TOO_BIG);
-TRACE_DEFINE_ENUM(NFS4ERR_RESOURCE);
-TRACE_DEFINE_ENUM(NFS4ERR_RESTOREFH);
-TRACE_DEFINE_ENUM(NFS4ERR_RETRY_UNCACHED_REP);
-TRACE_DEFINE_ENUM(NFS4ERR_RETURNCONFLICT);
-TRACE_DEFINE_ENUM(NFS4ERR_ROFS);
-TRACE_DEFINE_ENUM(NFS4ERR_SAME);
-TRACE_DEFINE_ENUM(NFS4ERR_SHARE_DENIED);
-TRACE_DEFINE_ENUM(NFS4ERR_SEQUENCE_POS);
-TRACE_DEFINE_ENUM(NFS4ERR_SEQ_FALSE_RETRY);
-TRACE_DEFINE_ENUM(NFS4ERR_SEQ_MISORDERED);
-TRACE_DEFINE_ENUM(NFS4ERR_SERVERFAULT);
-TRACE_DEFINE_ENUM(NFS4ERR_STALE);
-TRACE_DEFINE_ENUM(NFS4ERR_STALE_CLIENTID);
-TRACE_DEFINE_ENUM(NFS4ERR_STALE_STATEID);
-TRACE_DEFINE_ENUM(NFS4ERR_SYMLINK);
-TRACE_DEFINE_ENUM(NFS4ERR_TOOSMALL);
-TRACE_DEFINE_ENUM(NFS4ERR_TOO_MANY_OPS);
-TRACE_DEFINE_ENUM(NFS4ERR_UNKNOWN_LAYOUTTYPE);
-TRACE_DEFINE_ENUM(NFS4ERR_UNSAFE_COMPOUND);
-TRACE_DEFINE_ENUM(NFS4ERR_WRONGSEC);
-TRACE_DEFINE_ENUM(NFS4ERR_WRONG_CRED);
-TRACE_DEFINE_ENUM(NFS4ERR_WRONG_TYPE);
-TRACE_DEFINE_ENUM(NFS4ERR_XDEV);
-
-TRACE_DEFINE_ENUM(NFS4ERR_RESET_TO_MDS);
-TRACE_DEFINE_ENUM(NFS4ERR_RESET_TO_PNFS);
-
-#define show_nfsv4_errors(error) \
- __print_symbolic(error, \
- { NFS4_OK, "OK" }, \
- /* Mapped by nfs4_stat_to_errno() */ \
- { EPERM, "EPERM" }, \
- { ENOENT, "ENOENT" }, \
- { EIO, "EIO" }, \
- { ENXIO, "ENXIO" }, \
- { EACCES, "EACCES" }, \
- { EEXIST, "EEXIST" }, \
- { EXDEV, "EXDEV" }, \
- { ENOTDIR, "ENOTDIR" }, \
- { EISDIR, "EISDIR" }, \
- { EFBIG, "EFBIG" }, \
- { ENOSPC, "ENOSPC" }, \
- { EROFS, "EROFS" }, \
- { EMLINK, "EMLINK" }, \
- { ENAMETOOLONG, "ENAMETOOLONG" }, \
- { ENOTEMPTY, "ENOTEMPTY" }, \
- { EDQUOT, "EDQUOT" }, \
- { ESTALE, "ESTALE" }, \
- { EBADHANDLE, "EBADHANDLE" }, \
- { EBADCOOKIE, "EBADCOOKIE" }, \
- { ENOTSUPP, "ENOTSUPP" }, \
- { ETOOSMALL, "ETOOSMALL" }, \
- { EREMOTEIO, "EREMOTEIO" }, \
- { EBADTYPE, "EBADTYPE" }, \
- { EAGAIN, "EAGAIN" }, \
- { ELOOP, "ELOOP" }, \
- { EOPNOTSUPP, "EOPNOTSUPP" }, \
- { EDEADLK, "EDEADLK" }, \
- /* RPC errors */ \
- { ENOMEM, "ENOMEM" }, \
- { EKEYEXPIRED, "EKEYEXPIRED" }, \
- { ETIMEDOUT, "ETIMEDOUT" }, \
- { ERESTARTSYS, "ERESTARTSYS" }, \
- { ECONNREFUSED, "ECONNREFUSED" }, \
- { ECONNRESET, "ECONNRESET" }, \
- { ENETUNREACH, "ENETUNREACH" }, \
- { EHOSTUNREACH, "EHOSTUNREACH" }, \
- { EHOSTDOWN, "EHOSTDOWN" }, \
- { EPIPE, "EPIPE" }, \
- { EPFNOSUPPORT, "EPFNOSUPPORT" }, \
- { EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
- /* NFSv4 native errors */ \
- { NFS4ERR_ACCESS, "ACCESS" }, \
- { NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
- { NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
- { NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
- { NFS4ERR_BADCHAR, "BADCHAR" }, \
- { NFS4ERR_BADHANDLE, "BADHANDLE" }, \
- { NFS4ERR_BADIOMODE, "BADIOMODE" }, \
- { NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
- { NFS4ERR_BADLABEL, "BADLABEL" }, \
- { NFS4ERR_BADNAME, "BADNAME" }, \
- { NFS4ERR_BADOWNER, "BADOWNER" }, \
- { NFS4ERR_BADSESSION, "BADSESSION" }, \
- { NFS4ERR_BADSLOT, "BADSLOT" }, \
- { NFS4ERR_BADTYPE, "BADTYPE" }, \
- { NFS4ERR_BADXDR, "BADXDR" }, \
- { NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
- { NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
- { NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
- { NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
- { NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
- { NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
- { NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
- { NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
- { NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
- { NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
- { NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
- "CONN_NOT_BOUND_TO_SESSION" }, \
- { NFS4ERR_DEADLOCK, "DEADLOCK" }, \
- { NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
- { NFS4ERR_DELAY, "DELAY" }, \
- { NFS4ERR_DELEG_ALREADY_WANTED, \
- "DELEG_ALREADY_WANTED" }, \
- { NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
- { NFS4ERR_DENIED, "DENIED" }, \
- { NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
- { NFS4ERR_DQUOT, "DQUOT" }, \
- { NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
- { NFS4ERR_EXIST, "EXIST" }, \
- { NFS4ERR_EXPIRED, "EXPIRED" }, \
- { NFS4ERR_FBIG, "FBIG" }, \
- { NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
- { NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
- { NFS4ERR_GRACE, "GRACE" }, \
- { NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
- { NFS4ERR_INVAL, "INVAL" }, \
- { NFS4ERR_IO, "IO" }, \
- { NFS4ERR_ISDIR, "ISDIR" }, \
- { NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
- { NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
- { NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
- { NFS4ERR_LOCKED, "LOCKED" }, \
- { NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
- { NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
- { NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
- { NFS4ERR_MLINK, "MLINK" }, \
- { NFS4ERR_MOVED, "MOVED" }, \
- { NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
- { NFS4ERR_NOENT, "NOENT" }, \
- { NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
- { NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
- { NFS4ERR_NOSPC, "NOSPC" }, \
- { NFS4ERR_NOTDIR, "NOTDIR" }, \
- { NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
- { NFS4ERR_NOTSUPP, "NOTSUPP" }, \
- { NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
- { NFS4ERR_NOT_SAME, "NOT_SAME" }, \
- { NFS4ERR_NO_GRACE, "NO_GRACE" }, \
- { NFS4ERR_NXIO, "NXIO" }, \
- { NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
- { NFS4ERR_OPENMODE, "OPENMODE" }, \
- { NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
- { NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
- { NFS4ERR_PERM, "PERM" }, \
- { NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
- { NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
- { NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
- { NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
- { NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
- { NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
- { NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
- { NFS4ERR_REP_TOO_BIG_TO_CACHE, \
- "REP_TOO_BIG_TO_CACHE" }, \
- { NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
- { NFS4ERR_RESOURCE, "RESOURCE" }, \
- { NFS4ERR_RESTOREFH, "RESTOREFH" }, \
- { NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
- { NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
- { NFS4ERR_ROFS, "ROFS" }, \
- { NFS4ERR_SAME, "SAME" }, \
- { NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
- { NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
- { NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
- { NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
- { NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
- { NFS4ERR_STALE, "STALE" }, \
- { NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
- { NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
- { NFS4ERR_SYMLINK, "SYMLINK" }, \
- { NFS4ERR_TOOSMALL, "TOOSMALL" }, \
- { NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
- { NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
- { NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
- { NFS4ERR_WRONGSEC, "WRONGSEC" }, \
- { NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
- { NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
- { NFS4ERR_XDEV, "XDEV" }, \
- /* ***** Internal to Linux NFS client ***** */ \
- { NFS4ERR_RESET_TO_MDS, "RESET_TO_MDS" }, \
- { NFS4ERR_RESET_TO_PNFS, "RESET_TO_PNFS" })
-
-#define show_open_flags(flags) \
- __print_flags(flags, "|", \
- { O_CREAT, "O_CREAT" }, \
- { O_EXCL, "O_EXCL" }, \
- { O_TRUNC, "O_TRUNC" }, \
- { O_DIRECT, "O_DIRECT" })
-
-#define show_fmode_flags(mode) \
- __print_flags(mode, "|", \
- { ((__force unsigned long)FMODE_READ), "READ" }, \
- { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
- { ((__force unsigned long)FMODE_EXEC), "EXEC" })
+#include <trace/events/fs.h>
+#include <trace/events/nfs.h>
#define show_nfs_fattr_flags(valid) \
__print_flags((unsigned long)valid, "|", \
@@ -365,7 +53,7 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event,
TP_printk(
"error=%ld (%s) dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__get_str(dstaddr)
)
);
@@ -389,29 +77,6 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
-#define show_nfs4_sequence_status_flags(status) \
- __print_flags((unsigned long)status, "|", \
- { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
- { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \
- "CB_GSS_CONTEXTS_EXPIRING" }, \
- { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \
- "CB_GSS_CONTEXTS_EXPIRED" }, \
- { SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \
- "EXPIRED_ALL_STATE_REVOKED" }, \
- { SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \
- "EXPIRED_SOME_STATE_REVOKED" }, \
- { SEQ4_STATUS_ADMIN_STATE_REVOKED, \
- "ADMIN_STATE_REVOKED" }, \
- { SEQ4_STATUS_RECALLABLE_STATE_REVOKED, \
- "RECALLABLE_STATE_REVOKED" }, \
- { SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \
- { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \
- "RESTART_RECLAIM_NEEDED" }, \
- { SEQ4_STATUS_CB_PATH_DOWN_SESSION, \
- "CB_PATH_DOWN_SESSION" }, \
- { SEQ4_STATUS_BACKCHANNEL_FAULT, \
- "BACKCHANNEL_FAULT" })
-
TRACE_EVENT(nfs4_sequence_done,
TP_PROTO(
const struct nfs4_session *session,
@@ -425,7 +90,7 @@ TRACE_EVENT(nfs4_sequence_done,
__field(unsigned int, seq_nr)
__field(unsigned int, highest_slotid)
__field(unsigned int, target_highest_slotid)
- __field(unsigned int, status_flags)
+ __field(unsigned long, status_flags)
__field(unsigned long, error)
),
@@ -444,16 +109,16 @@ TRACE_EVENT(nfs4_sequence_done,
TP_printk(
"error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
"highest_slotid=%u target_highest_slotid=%u "
- "status_flags=%u (%s)",
+ "status_flags=0x%lx (%s)",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__entry->session,
__entry->slot_nr,
__entry->seq_nr,
__entry->highest_slotid,
__entry->target_highest_slotid,
__entry->status_flags,
- show_nfs4_sequence_status_flags(__entry->status_flags)
+ show_nfs4_seq4_status(__entry->status_flags)
)
);
@@ -490,7 +155,7 @@ TRACE_EVENT(nfs4_cb_sequence,
"error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
"highest_slotid=%u",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__entry->session,
__entry->slot_nr,
__entry->seq_nr,
@@ -527,7 +192,7 @@ TRACE_EVENT(nfs4_cb_seqid_err,
"error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
"highest_slotid=%u",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__entry->session,
__entry->slot_nr,
__entry->seq_nr,
@@ -535,6 +200,49 @@ TRACE_EVENT(nfs4_cb_seqid_err,
)
);
+TRACE_EVENT(nfs4_cb_offload,
+ TP_PROTO(
+ const struct nfs_fh *cb_fh,
+ const nfs4_stateid *cb_stateid,
+ uint64_t cb_count,
+ int cb_error,
+ int cb_how_stable
+ ),
+
+ TP_ARGS(cb_fh, cb_stateid, cb_count, cb_error,
+ cb_how_stable),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(loff_t, cb_count)
+ __field(int, cb_how)
+ __field(int, cb_stateid_seq)
+ __field(u32, cb_stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = cb_error < 0 ? -cb_error : 0;
+ __entry->fhandle = nfs_fhandle_hash(cb_fh);
+ __entry->cb_stateid_seq =
+ be32_to_cpu(cb_stateid->seqid);
+ __entry->cb_stateid_hash =
+ nfs_stateid_hash(cb_stateid);
+ __entry->cb_count = cb_count;
+ __entry->cb_how = cb_how_stable;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fhandle=0x%08x cb_stateid=%d:0x%08x "
+ "cb_count=%llu cb_how=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->fhandle,
+ __entry->cb_stateid_seq, __entry->cb_stateid_hash,
+ __entry->cb_count,
+ show_nfs_stable_how(__entry->cb_how)
+ )
+);
#endif /* CONFIG_NFS_V4_1 */
TRACE_EVENT(nfs4_setup_sequence,
@@ -661,7 +369,7 @@ TRACE_EVENT(nfs4_state_mgr_failed,
"hostname=%s clp state=%s error=%ld (%s) section=%s",
__get_str(hostname),
show_nfs4_clp_state(__entry->state), -__entry->error,
- show_nfsv4_errors(__entry->error), __get_str(section)
+ show_nfs4_status(__entry->error), __get_str(section)
)
)
@@ -694,8 +402,8 @@ TRACE_EVENT(nfs4_xdr_bad_operation,
__entry->expected = expected;
),
- TP_printk(
- "task:%u@%d xid=0x%08x operation=%u, expected=%u",
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " xid=0x%08x operation=%u, expected=%u",
__entry->task_id, __entry->client_id, __entry->xid,
__entry->op, __entry->expected
)
@@ -729,10 +437,10 @@ DECLARE_EVENT_CLASS(nfs4_xdr_event,
__entry->error = error;
),
- TP_printk(
- "task:%u@%d xid=0x%08x error=%ld (%s) operation=%u",
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " xid=0x%08x error=%ld (%s) operation=%u",
__entry->task_id, __entry->client_id, __entry->xid,
- -__entry->error, show_nfsv4_errors(__entry->error),
+ -__entry->error, show_nfs4_status(__entry->error),
__entry->op
)
);
@@ -793,8 +501,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
TP_STRUCT__entry(
__field(unsigned long, error)
- __field(unsigned int, flags)
- __field(unsigned int, fmode)
+ __field(unsigned long, flags)
+ __field(unsigned long, fmode)
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
@@ -812,7 +520,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->error = -error;
__entry->flags = flags;
- __entry->fmode = (__force unsigned int)ctx->mode;
+ __entry->fmode = (__force unsigned long)ctx->mode;
__entry->dev = ctx->dentry->d_sb->s_dev;
if (!IS_ERR_OR_NULL(state)) {
inode = state->inode;
@@ -842,15 +550,15 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
),
TP_printk(
- "error=%ld (%s) flags=%d (%s) fmode=%s "
+ "error=%ld (%s) flags=%lu (%s) fmode=%s "
"fileid=%02x:%02x:%llu fhandle=0x%08x "
"name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
"openstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__entry->flags,
- show_open_flags(__entry->flags),
- show_fmode_flags(__entry->fmode),
+ show_fs_fcntl_open_flags(__entry->flags),
+ show_fs_fmode_flags(__entry->fmode),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -904,7 +612,7 @@ TRACE_EVENT(nfs4_cached_open,
TP_printk(
"fmode=%s fileid=%02x:%02x:%llu "
"fhandle=0x%08x stateid=%d:0x%08x",
- __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ __entry->fmode ? show_fs_fmode_flags(__entry->fmode) :
"closed",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
@@ -951,8 +659,8 @@ TRACE_EVENT(nfs4_close,
"error=%ld (%s) fmode=%s fileid=%02x:%02x:%llu "
"fhandle=0x%08x openstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
- __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ show_nfs4_status(__entry->error),
+ __entry->fmode ? show_fs_fmode_flags(__entry->fmode) :
"closed",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
@@ -961,24 +669,6 @@ TRACE_EVENT(nfs4_close,
)
);
-TRACE_DEFINE_ENUM(F_GETLK);
-TRACE_DEFINE_ENUM(F_SETLK);
-TRACE_DEFINE_ENUM(F_SETLKW);
-TRACE_DEFINE_ENUM(F_RDLCK);
-TRACE_DEFINE_ENUM(F_WRLCK);
-TRACE_DEFINE_ENUM(F_UNLCK);
-
-#define show_lock_cmd(type) \
- __print_symbolic((int)type, \
- { F_GETLK, "GETLK" }, \
- { F_SETLK, "SETLK" }, \
- { F_SETLKW, "SETLKW" })
-#define show_lock_type(type) \
- __print_symbolic((int)type, \
- { F_RDLCK, "RDLCK" }, \
- { F_WRLCK, "WRLCK" }, \
- { F_UNLCK, "UNLCK" })
-
DECLARE_EVENT_CLASS(nfs4_lock_event,
TP_PROTO(
const struct file_lock *request,
@@ -991,8 +681,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
TP_STRUCT__entry(
__field(unsigned long, error)
- __field(int, cmd)
- __field(char, type)
+ __field(unsigned long, cmd)
+ __field(unsigned long, type)
__field(loff_t, start)
__field(loff_t, end)
__field(dev_t, dev)
@@ -1024,9 +714,9 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
"fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
- show_lock_cmd(__entry->cmd),
- show_lock_type(__entry->type),
+ show_nfs4_status(__entry->error),
+ show_fs_fcntl_cmd(__entry->cmd),
+ show_fs_fcntl_lock_type(__entry->type),
(long long)__entry->start,
(long long)__entry->end,
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1061,8 +751,8 @@ TRACE_EVENT(nfs4_set_lock,
TP_STRUCT__entry(
__field(unsigned long, error)
- __field(int, cmd)
- __field(char, type)
+ __field(unsigned long, cmd)
+ __field(unsigned long, type)
__field(loff_t, start)
__field(loff_t, end)
__field(dev_t, dev)
@@ -1100,9 +790,9 @@ TRACE_EVENT(nfs4_set_lock,
"fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x lockstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
- show_lock_cmd(__entry->cmd),
- show_lock_type(__entry->type),
+ show_nfs4_status(__entry->error),
+ show_fs_fcntl_cmd(__entry->cmd),
+ show_fs_fcntl_lock_type(__entry->type),
(long long)__entry->start,
(long long)__entry->end,
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1219,7 +909,7 @@ DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
TP_printk(
"fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x",
- show_fmode_flags(__entry->fmode),
+ show_fs_fmode_flags(__entry->fmode),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle
@@ -1266,7 +956,7 @@ TRACE_EVENT(nfs4_delegreturn_exit,
"error=%ld (%s) dev=%02x:%02x fhandle=0x%08x "
"stateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->fhandle,
__entry->stateid_seq, __entry->stateid_hash
@@ -1309,7 +999,7 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1356,7 +1046,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event,
TP_printk(
"error=%ld (%s) name=%02x:%02x:%llu/%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -1403,7 +1093,7 @@ TRACE_EVENT(nfs4_lookupp,
TP_printk(
"error=%ld (%s) inode=%02x:%02x:%llu",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->ino
)
@@ -1442,7 +1132,7 @@ TRACE_EVENT(nfs4_rename,
"error=%ld (%s) oldname=%02x:%02x:%llu/%s "
"newname=%02x:%02x:%llu/%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->olddir,
__get_str(oldname),
@@ -1477,7 +1167,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_event,
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle
@@ -1535,7 +1225,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1588,7 +1278,7 @@ DECLARE_EVENT_CLASS(nfs4_getattr_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"valid=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1644,7 +1334,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1705,7 +1395,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1754,7 +1444,7 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event,
TP_printk(
"error=%ld (%s) id=%u name=%s",
- -__entry->error, show_nfsv4_errors(__entry->error),
+ -__entry->error, show_nfs4_status(__entry->error),
__entry->id,
__get_str(name)
)
@@ -1832,7 +1522,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
"offset=%lld count=%u res=%u stateid=%d:0x%08x "
"layoutstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1906,7 +1596,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
"offset=%lld count=%u res=%u stateid=%d:0x%08x "
"layoutstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1970,7 +1660,7 @@ DECLARE_EVENT_CLASS(nfs4_commit_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"offset=%lld count=%u layoutstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1990,16 +1680,6 @@ DEFINE_NFS4_COMMIT_EVENT(nfs4_commit);
#ifdef CONFIG_NFS_V4_1
DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds);
-TRACE_DEFINE_ENUM(IOMODE_READ);
-TRACE_DEFINE_ENUM(IOMODE_RW);
-TRACE_DEFINE_ENUM(IOMODE_ANY);
-
-#define show_pnfs_iomode(iomode) \
- __print_symbolic(iomode, \
- { IOMODE_READ, "READ" }, \
- { IOMODE_RW, "RW" }, \
- { IOMODE_ANY, "ANY" })
-
TRACE_EVENT(nfs4_layoutget,
TP_PROTO(
const struct nfs_open_context *ctx,
@@ -2055,11 +1735,11 @@ TRACE_EVENT(nfs4_layoutget,
"iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
"layoutstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
- show_pnfs_iomode(__entry->iomode),
+ show_pnfs_layout_iomode(__entry->iomode),
(unsigned long long)__entry->offset,
(unsigned long long)__entry->count,
__entry->stateid_seq, __entry->stateid_hash,
@@ -2153,7 +1833,7 @@ TRACE_EVENT(pnfs_update_layout,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
- show_pnfs_iomode(__entry->iomode),
+ show_pnfs_layout_iomode(__entry->iomode),
(unsigned long long)__entry->pos,
(unsigned long long)__entry->count,
__entry->layoutstateid_seq, __entry->layoutstateid_hash,
@@ -2207,7 +1887,7 @@ DECLARE_EVENT_CLASS(pnfs_layout_event,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
- show_pnfs_iomode(__entry->iomode),
+ show_pnfs_layout_iomode(__entry->iomode),
(unsigned long long)__entry->pos,
(unsigned long long)__entry->count,
__entry->layoutstateid_seq, __entry->layoutstateid_hash,
@@ -2352,7 +2032,7 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -2408,7 +2088,7 @@ TRACE_EVENT(ff_layout_commit_error,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"offset=%llu count=%u dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -2417,6 +2097,406 @@ TRACE_EVENT(ff_layout_commit_error,
)
);
+TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA); /* the two values show_llseek_mode() below renders as "DATA" / "HOLE" */
+TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE);
+
+#define show_llseek_mode(what) \
+ __print_symbolic(what, \
+ { NFS4_CONTENT_DATA, "DATA" }, \
+ { NFS4_CONTENT_HOLE, "HOLE" })
+
+#ifdef CONFIG_NFS_V4_2
+TRACE_EVENT(nfs4_llseek, /* records the seek arguments and, on success, the returned offset and eof flag */
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs42_seek_args *args,
+ const struct nfs42_seek_res *res,
+ int error
+ ),
+
+ TP_ARGS(inode, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(u64, fileid) /* u64, matching the %llu print and the u64 fileid of the other events */
+ __field(dev_t, dev)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(loff_t, offset_s)
+ __field(u32, what)
+ __field(loff_t, offset_r)
+ __field(u32, eof)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = args->sa_fh;
+
+ __entry->fileid = nfsi->fileid;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset_s = args->sa_offset; /* requested (start) offset */
+ __entry->stateid_seq =
+ be32_to_cpu(args->sa_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->sa_stateid);
+ __entry->what = args->sa_what;
+ if (error) { /* failure: res is not read, result fields are zeroed */
+ __entry->error = -error;
+ __entry->offset_r = 0;
+ __entry->eof = 0;
+ } else { /* success: copy the result offset and eof flag out of res */
+ __entry->error = 0;
+ __entry->offset_r = res->sr_offset;
+ __entry->eof = res->sr_eof;
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x offset_s=%llu what=%s "
+ "offset_r=%llu eof=%u",
+ -__entry->error, /* stored as -error above, negated again for display */
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->offset_s,
+ show_llseek_mode(__entry->what),
+ __entry->offset_r,
+ __entry->eof
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs4_sparse_event, /* shared layout for events that take (inode, falloc args, error) */
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs42_falloc_args *args,
+ int error
+ ),
+
+ TP_ARGS(inode, args, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(loff_t, offset)
+ __field(loff_t, len)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0; /* negative error stored as its positive value; anything else recorded as 0 */
+ __entry->offset = args->falloc_offset;
+ __entry->len = args->falloc_length;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(args->falloc_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->falloc_stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x offset=%llu len=%llu",
+ -__entry->error, /* negated again so the printed value is negative */
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ (long long)__entry->offset, /* NOTE(review): signed cast paired with %llu in the format */
+ (long long)__entry->len
+ )
+);
+#define DEFINE_NFS4_SPARSE_EVENT(name) /* declares event `name` on the nfs4_sparse_event class */ \
+ DEFINE_EVENT(nfs4_sparse_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const struct nfs42_falloc_args *args, \
+ int error \
+ ), \
+ TP_ARGS(inode, args, error))
+DEFINE_NFS4_SPARSE_EVENT(nfs4_fallocate); /* event nfs4_fallocate */
+DEFINE_NFS4_SPARSE_EVENT(nfs4_deallocate); /* event nfs4_deallocate */
+
+TRACE_EVENT(nfs4_copy, /* records source/destination identity, the copy arguments and, on success, the copy result */
+ TP_PROTO(
+ const struct inode *src_inode,
+ const struct inode *dst_inode,
+ const struct nfs42_copy_args *args,
+ const struct nfs42_copy_res *res,
+ const struct nl4_server *nss,
+ int error
+ ),
+
+ TP_ARGS(src_inode, dst_inode, args, res, nss, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, src_fhandle)
+ __field(u64, src_fileid) /* u64, matching the %llu print and the u64 fileid of the other events */
+ __field(u32, dst_fhandle)
+ __field(u64, dst_fileid) /* u64 for the same reason as src_fileid */
+ __field(dev_t, src_dev)
+ __field(dev_t, dst_dev)
+ __field(int, src_stateid_seq)
+ __field(u32, src_stateid_hash)
+ __field(int, dst_stateid_seq)
+ __field(u32, dst_stateid_hash)
+ __field(loff_t, src_offset)
+ __field(loff_t, dst_offset)
+ __field(bool, sync)
+ __field(loff_t, len)
+ __field(int, res_stateid_seq)
+ __field(u32, res_stateid_hash)
+ __field(loff_t, res_count)
+ __field(bool, res_sync)
+ __field(bool, res_cons)
+ __field(bool, intra)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *src_nfsi = NFS_I(src_inode);
+ const struct nfs_inode *dst_nfsi = NFS_I(dst_inode);
+
+ __entry->src_fileid = src_nfsi->fileid;
+ __entry->src_dev = src_inode->i_sb->s_dev;
+ __entry->src_fhandle = nfs_fhandle_hash(args->src_fh);
+ __entry->src_offset = args->src_pos;
+ __entry->dst_fileid = dst_nfsi->fileid;
+ __entry->dst_dev = dst_inode->i_sb->s_dev;
+ __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh);
+ __entry->dst_offset = args->dst_pos;
+ __entry->len = args->count;
+ __entry->sync = args->sync;
+ __entry->src_stateid_seq =
+ be32_to_cpu(args->src_stateid.seqid);
+ __entry->src_stateid_hash =
+ nfs_stateid_hash(&args->src_stateid);
+ __entry->dst_stateid_seq =
+ be32_to_cpu(args->dst_stateid.seqid);
+ __entry->dst_stateid_hash =
+ nfs_stateid_hash(&args->dst_stateid);
+ __entry->intra = nss ? 0 : 1; /* intra is 1 only when no nss was passed */
+ if (error) { /* failure: res is not read, result fields are zeroed */
+ __entry->error = -error;
+ __entry->res_stateid_seq = 0;
+ __entry->res_stateid_hash = 0;
+ __entry->res_count = 0;
+ __entry->res_sync = 0;
+ __entry->res_cons = 0;
+ } else { /* success: copy the write result and the sync/consecutive flags out of res */
+ __entry->error = 0;
+ __entry->res_stateid_seq =
+ be32_to_cpu(res->write_res.stateid.seqid);
+ __entry->res_stateid_hash =
+ nfs_stateid_hash(&res->write_res.stateid);
+ __entry->res_count = res->write_res.count;
+ __entry->res_sync = res->synchronous;
+ __entry->res_cons = res->consecutive;
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) intra=%d src_fileid=%02x:%02x:%llu "
+ "src_fhandle=0x%08x dst_fileid=%02x:%02x:%llu "
+ "dst_fhandle=0x%08x src_stateid=%d:0x%08x "
+ "dst_stateid=%d:0x%08x src_offset=%llu dst_offset=%llu "
+ "len=%llu sync=%d cb_stateid=%d:0x%08x res_sync=%d "
+ "res_cons=%d res_count=%llu",
+ -__entry->error, /* stored as -error above, negated again for display */
+ show_nfs4_status(__entry->error),
+ __entry->intra,
+ MAJOR(__entry->src_dev), MINOR(__entry->src_dev),
+ (unsigned long long)__entry->src_fileid,
+ __entry->src_fhandle,
+ MAJOR(__entry->dst_dev), MINOR(__entry->dst_dev),
+ (unsigned long long)__entry->dst_fileid,
+ __entry->dst_fhandle,
+ __entry->src_stateid_seq, __entry->src_stateid_hash,
+ __entry->dst_stateid_seq, __entry->dst_stateid_hash,
+ __entry->src_offset,
+ __entry->dst_offset,
+ __entry->len,
+ __entry->sync,
+ __entry->res_stateid_seq, __entry->res_stateid_hash, /* shown under the "cb_stateid" label */
+ __entry->res_sync,
+ __entry->res_cons,
+ __entry->res_count
+ )
+);
+
+TRACE_EVENT(nfs4_clone, /* records source/destination identity, stateids and the cloned range */
+ TP_PROTO(
+ const struct inode *src_inode,
+ const struct inode *dst_inode,
+ const struct nfs42_clone_args *args,
+ int error
+ ),
+
+ TP_ARGS(src_inode, dst_inode, args, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, src_fhandle)
+ __field(u64, src_fileid) /* u64, matching the %llu print and the u64 fileid of the other events */
+ __field(u32, dst_fhandle)
+ __field(u64, dst_fileid) /* u64 for the same reason as src_fileid */
+ __field(dev_t, src_dev)
+ __field(dev_t, dst_dev)
+ __field(loff_t, src_offset)
+ __field(loff_t, dst_offset)
+ __field(int, src_stateid_seq)
+ __field(u32, src_stateid_hash)
+ __field(int, dst_stateid_seq)
+ __field(u32, dst_stateid_hash)
+ __field(loff_t, len)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *src_nfsi = NFS_I(src_inode);
+ const struct nfs_inode *dst_nfsi = NFS_I(dst_inode);
+
+ __entry->src_fileid = src_nfsi->fileid;
+ __entry->src_dev = src_inode->i_sb->s_dev;
+ __entry->src_fhandle = nfs_fhandle_hash(args->src_fh);
+ __entry->src_offset = args->src_offset;
+ __entry->dst_fileid = dst_nfsi->fileid;
+ __entry->dst_dev = dst_inode->i_sb->s_dev;
+ __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh);
+ __entry->dst_offset = args->dst_offset;
+ __entry->len = args->count;
+ __entry->error = error < 0 ? -error : 0; /* negative error stored as its positive value; anything else recorded as 0 */
+ __entry->src_stateid_seq =
+ be32_to_cpu(args->src_stateid.seqid);
+ __entry->src_stateid_hash =
+ nfs_stateid_hash(&args->src_stateid);
+ __entry->dst_stateid_seq =
+ be32_to_cpu(args->dst_stateid.seqid);
+ __entry->dst_stateid_hash =
+ nfs_stateid_hash(&args->dst_stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) src_fileid=%02x:%02x:%llu "
+ "src_fhandle=0x%08x dst_fileid=%02x:%02x:%llu "
+ "dst_fhandle=0x%08x src_stateid=%d:0x%08x "
+ "dst_stateid=%d:0x%08x src_offset=%llu "
+ "dst_offset=%llu len=%llu",
+ -__entry->error, /* negated again so the printed value is negative */
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->src_dev), MINOR(__entry->src_dev),
+ (unsigned long long)__entry->src_fileid,
+ __entry->src_fhandle,
+ MAJOR(__entry->dst_dev), MINOR(__entry->dst_dev),
+ (unsigned long long)__entry->dst_fileid,
+ __entry->dst_fhandle,
+ __entry->src_stateid_seq, __entry->src_stateid_hash,
+ __entry->dst_stateid_seq, __entry->dst_stateid_hash,
+ __entry->src_offset,
+ __entry->dst_offset,
+ __entry->len
+ )
+);
+
+TRACE_EVENT(nfs4_copy_notify, /* records the source file, its stateid and, on success, the returned stateid */
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs42_copy_notify_args *args,
+ const struct nfs42_copy_notify_res *res,
+ int error
+ ),
+
+ TP_ARGS(inode, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(u64, fileid) /* u64, matching the %llu print and the u64 fileid of the other events */
+ __field(dev_t, dev)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, res_stateid_seq)
+ __field(u32, res_stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->fileid = nfsi->fileid;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(args->cna_src_fh);
+ __entry->stateid_seq =
+ be32_to_cpu(args->cna_src_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->cna_src_stateid);
+ if (error) { /* failure: res is not read, result stateid fields are zeroed */
+ __entry->error = -error;
+ __entry->res_stateid_seq = 0;
+ __entry->res_stateid_hash = 0;
+ } else { /* success: record the stateid carried in res */
+ __entry->error = 0;
+ __entry->res_stateid_seq =
+ be32_to_cpu(res->cnr_stateid.seqid);
+ __entry->res_stateid_hash =
+ nfs_stateid_hash(&res->cnr_stateid);
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x res_stateid=%d:0x%08x",
+ -__entry->error, /* stored as -error above, negated again for display */
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->res_stateid_seq, __entry->res_stateid_hash
+ )
+);
+
+TRACE_EVENT(nfs4_offload_cancel, /* records the source file handle hash, the stateid and the error */
+ TP_PROTO(
+ const struct nfs42_offload_status_args *args,
+ int error
+ ),
+
+ TP_ARGS(args, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->fhandle = nfs_fhandle_hash(args->osa_src_fh);
+ __entry->error = error < 0 ? -error : 0; /* negative error stored as its positive value; anything else recorded as 0 */
+ __entry->stateid_seq =
+ be32_to_cpu(args->osa_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->osa_stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fhandle=0x%08x stateid=%d:0x%08x",
+ -__entry->error, /* negated again so the printed value is negative */
+ show_nfs4_status(__entry->error),
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+#endif /* CONFIG_NFS_V4_2 */
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a8cff19c6f00..86a5f6516928 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1605,7 +1605,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
FATTR4_WORD0_RDATTR_ERROR,
FATTR4_WORD1_MOUNTED_ON_FILEID,
};
- uint32_t dircount = readdir->count >> 1;
+ uint32_t dircount = readdir->count;
+ uint32_t maxcount = readdir->count;
__be32 *p, verf[2];
uint32_t attrlen = 0;
unsigned int i;
@@ -1618,7 +1619,6 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
- dircount >>= 1;
}
/* Use mounted_on_fileid only if the server supports it */
if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
@@ -1634,7 +1634,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
encode_nfs4_verifier(xdr, &readdir->verifier);
p = reserve_space(xdr, 12 + (attrlen << 2));
*p++ = cpu_to_be32(dircount);
- *p++ = cpu_to_be32(readdir->count);
+ *p++ = cpu_to_be32(maxcount);
*p++ = cpu_to_be32(attrlen);
for (i = 0; i < attrlen; i++)
*p++ = cpu_to_be32(attrs[i]);
@@ -3168,20 +3168,23 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
- __be32 *p;
+ ssize_t ret;
+ void *ptr;
+ u32 tmp;
- p = xdr_inline_decode(xdr, 8);
- if (unlikely(!p))
+ if (xdr_stream_decode_u32(xdr, &tmp) < 0)
return -EIO;
- hdr->status = be32_to_cpup(p++);
- hdr->taglen = be32_to_cpup(p);
+ hdr->status = tmp;
- p = xdr_inline_decode(xdr, hdr->taglen + 4);
- if (unlikely(!p))
+ ret = xdr_stream_decode_opaque_inline(xdr, &ptr, NFS4_OPAQUE_LIMIT);
+ if (ret < 0)
+ return -EIO;
+ hdr->taglen = ret;
+ hdr->tag = ptr;
+
+ if (xdr_stream_decode_u32(xdr, &tmp) < 0)
return -EIO;
- hdr->tag = (char *)p;
- p += XDR_QUADLEN(hdr->taglen);
- hdr->nops = be32_to_cpup(p);
+ hdr->nops = tmp;
if (unlikely(hdr->nops < 1))
return nfs4_stat_to_errno(hdr->status);
return 0;
@@ -3530,6 +3533,42 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
return 0;
}
+static int decode_attr_case_insensitive(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{ /* Decode the CASE_INSENSITIVE attribute into *res when its bit is set in bitmap[0]; returns 0 or -EIO. */
+ __be32 *p;
+
+ *res = 0; /* value reported when the attribute is absent */
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_INSENSITIVE - 1U))) /* some bit below this attribute's bit is still set; presumably an earlier attribute was left undecoded */
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_CASE_INSENSITIVE)) {
+ p = xdr_inline_decode(xdr, 4); /* the attribute occupies one 4-byte word */
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_CASE_INSENSITIVE; /* clear the bit once decoded */
+ }
+ dprintk("%s: case_insensitive=%s\n", __func__, *res == 0 ? "false" : "true");
+ return 0;
+}
+
+static int decode_attr_case_preserving(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{ /* Decode the CASE_PRESERVING attribute into *res when its bit is set in bitmap[0]; returns 0 or -EIO. */
+ __be32 *p;
+
+ *res = 0; /* value reported when the attribute is absent */
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_PRESERVING - 1U))) /* some bit below this attribute's bit is still set; presumably an earlier attribute was left undecoded */
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_CASE_PRESERVING)) {
+ p = xdr_inline_decode(xdr, 4); /* the attribute occupies one 4-byte word */
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_CASE_PRESERVING; /* clear the bit once decoded */
+ }
+ dprintk("%s: case_preserving=%s\n", __func__, *res == 0 ? "false" : "true");
+ return 0;
+}
+
static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
{
__be32 *p;
@@ -3693,8 +3732,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
if (unlikely(!p))
goto out_eio;
n = be32_to_cpup(p);
- if (n <= 0)
- goto out_eio;
for (res->nlocations = 0; res->nlocations < n; res->nlocations++) {
u32 m;
struct nfs4_fs_location *loc;
@@ -4197,10 +4234,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
} else
printk(KERN_WARNING "%s: label too long (%u)!\n",
__func__, len);
+ if (label && label->label)
+ dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n",
+ __func__, label->len, (char *)label->label,
+ label->len, label->pi, label->lfs);
}
- if (label && label->label)
- dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
- (char *)label->label, label->len, label->pi, label->lfs);
return status;
}
@@ -4409,6 +4447,10 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
goto xdr_error;
if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
goto xdr_error;
+ if ((status = decode_attr_case_insensitive(xdr, bitmap, &res->case_insensitive)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_case_preserving(xdr, bitmap, &res->case_preserving)) != 0)
+ goto xdr_error;
if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
res->exclcreat_bitmask)) != 0)
goto xdr_error;
@@ -4582,8 +4624,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
struct nfs_fattr *fattr, struct nfs_fh *fh,
- struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
- const struct nfs_server *server)
+ struct nfs4_fs_locations *fs_loc, const struct nfs_server *server)
{
int status;
umode_t fmode = 0;
@@ -4698,8 +4739,8 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
if (status < 0)
goto xdr_error;
- if (label) {
- status = decode_attr_security_label(xdr, bitmap, label);
+ if (fattr->label) {
+ status = decode_attr_security_label(xdr, bitmap, fattr->label);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
@@ -4712,7 +4753,7 @@ xdr_error:
static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
- struct nfs4_label *label, const struct nfs_server *server)
+ const struct nfs_server *server)
{
unsigned int savep;
uint32_t attrlen,
@@ -4731,8 +4772,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
if (status < 0)
goto xdr_error;
- status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
- label, server);
+ status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
if (status < 0)
goto xdr_error;
@@ -4742,16 +4782,10 @@ xdr_error:
return status;
}
-static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
- struct nfs4_label *label, const struct nfs_server *server)
-{
- return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
-}
-
static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
const struct nfs_server *server)
{ /* Thin wrapper: decode attributes into fattr, passing NULL for both the fh and fs_loc arguments. */
- return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
}
/*
@@ -5572,20 +5606,9 @@ static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_re
static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
{ /* Decode the operation bitmap into op_map->u.words; returns 0, or -EIO on any decode failure. */
- __be32 *p;
- uint32_t bitmap_words;
- unsigned int i;
-
- p = xdr_inline_decode(xdr, 4);
- if (!p)
- return -EIO;
- bitmap_words = be32_to_cpup(p++);
- if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
+ if (xdr_stream_decode_uint32_array(xdr, op_map->u.words, /* the helper is bounded by the size of the destination array */
+ ARRAY_SIZE(op_map->u.words)) < 0)
return -EIO;
- p = xdr_inline_decode(xdr, 4 * bitmap_words);
- for (i = 0; i < bitmap_words; i++)
- op_map->u.words[i] = be32_to_cpup(p++);
-
return 0;
}
@@ -6179,7 +6202,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6209,7 +6232,7 @@ static int nfs4_xdr_dec_lookupp(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6236,8 +6259,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
goto out;
status = decode_getfh(xdr, res->fh);
if (status == 0)
- status = decode_getfattr_label(xdr, res->fattr,
- res->label, res->server);
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6331,7 +6353,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6361,7 +6383,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6394,7 +6416,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6532,7 +6554,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
goto out;
if (res->access_request)
decode_access(xdr, &res->access_supported, &res->access_result);
- decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
+ decode_getfattr(xdr, res->f_attr, res->server);
if (res->lg_res)
decode_layoutget(xdr, rqstp, res->lg_res);
out:
@@ -6616,7 +6638,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
status = decode_setattr(xdr);
if (status)
goto out;
- decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -7031,7 +7053,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
status = decode_getfattr_generic(xdr,
&res->fs_locations->fattr,
NULL, res->fs_locations,
- NULL, res->fs_locations->server);
+ res->fs_locations->server);
if (status)
goto out;
if (res->renew)
@@ -7044,7 +7066,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
status = decode_getfattr_generic(xdr,
&res->fs_locations->fattr,
NULL, res->fs_locations,
- NULL, res->fs_locations->server);
+ res->fs_locations->server);
}
out:
return status;
@@ -7475,7 +7497,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
return -EAGAIN;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
- NULL, entry->label, entry->server) < 0)
+ NULL, entry->server) < 0)
return -EAGAIN;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
@@ -7486,7 +7508,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
- entry->prev_cookie = entry->cookie;
entry->cookie = new_cookie;
return 0;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 8a224871be74..012bd7339862 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -11,45 +11,9 @@
#include <linux/tracepoint.h>
#include <linux/iversion.h>
-TRACE_DEFINE_ENUM(DT_UNKNOWN);
-TRACE_DEFINE_ENUM(DT_FIFO);
-TRACE_DEFINE_ENUM(DT_CHR);
-TRACE_DEFINE_ENUM(DT_DIR);
-TRACE_DEFINE_ENUM(DT_BLK);
-TRACE_DEFINE_ENUM(DT_REG);
-TRACE_DEFINE_ENUM(DT_LNK);
-TRACE_DEFINE_ENUM(DT_SOCK);
-TRACE_DEFINE_ENUM(DT_WHT);
-
-#define nfs_show_file_type(ftype) \
- __print_symbolic(ftype, \
- { DT_UNKNOWN, "UNKNOWN" }, \
- { DT_FIFO, "FIFO" }, \
- { DT_CHR, "CHR" }, \
- { DT_DIR, "DIR" }, \
- { DT_BLK, "BLK" }, \
- { DT_REG, "REG" }, \
- { DT_LNK, "LNK" }, \
- { DT_SOCK, "SOCK" }, \
- { DT_WHT, "WHT" })
-
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_DATA);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_ATIME);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACCESS);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACL);
-TRACE_DEFINE_ENUM(NFS_INO_REVAL_PAGECACHE);
-TRACE_DEFINE_ENUM(NFS_INO_REVAL_FORCED);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_LABEL);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_CHANGE);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_CTIME);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_MTIME);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_SIZE);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
-TRACE_DEFINE_ENUM(NFS_INO_DATA_INVAL_DEFER);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_BLOCKS);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_MODE);
+#include <trace/events/fs.h>
+#include <trace/events/nfs.h>
+#include <trace/events/sunrpc_base.h>
#define nfs_show_cache_validity(v) \
__print_flags(v, "|", \
@@ -57,7 +21,6 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_MODE);
{ NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \
{ NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \
{ NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
- { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \
{ NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
{ NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \
{ NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \
@@ -71,25 +34,12 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_MODE);
{ NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \
{ NFS_INO_INVALID_MODE, "INVALID_MODE" })
-TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
-TRACE_DEFINE_ENUM(NFS_INO_STALE);
-TRACE_DEFINE_ENUM(NFS_INO_ACL_LRU_SET);
-TRACE_DEFINE_ENUM(NFS_INO_INVALIDATING);
-TRACE_DEFINE_ENUM(NFS_INO_FSCACHE);
-TRACE_DEFINE_ENUM(NFS_INO_FSCACHE_LOCK);
-TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMIT);
-TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMITTING);
-TRACE_DEFINE_ENUM(NFS_INO_LAYOUTSTATS);
-TRACE_DEFINE_ENUM(NFS_INO_ODIRECT);
-
#define nfs_show_nfsi_flags(v) \
__print_flags(v, "|", \
- { BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \
{ BIT(NFS_INO_STALE), "STALE" }, \
{ BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
{ BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
{ BIT(NFS_INO_FSCACHE), "FSCACHE" }, \
- { BIT(NFS_INO_FSCACHE_LOCK), "FSCACHE_LOCK" }, \
{ BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \
{ BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \
{ BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
@@ -163,12 +113,12 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"type=%u (%s) version=%llu size=%lld "
"cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s)",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
__entry->type,
- nfs_show_file_type(__entry->type),
+ show_fs_dirent_type(__entry->type),
(unsigned long long)__entry->version,
(long long)__entry->size,
__entry->cache_validity,
@@ -209,6 +159,10 @@ DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
DEFINE_NFS_INODE_EVENT(nfs_access_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid);
+DEFINE_NFS_INODE_EVENT(nfs_readdir_force_readdirplus);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done);
TRACE_EVENT(nfs_access_exit,
TP_PROTO(
@@ -254,12 +208,12 @@ TRACE_EVENT(nfs_access_exit,
"type=%u (%s) version=%llu size=%lld "
"cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s) "
"mask=0x%x permitted=0x%x",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
__entry->type,
- nfs_show_file_type(__entry->type),
+ show_fs_dirent_type(__entry->type),
(unsigned long long)__entry->version,
(long long)__entry->size,
__entry->cache_validity,
@@ -270,33 +224,171 @@ TRACE_EVENT(nfs_access_exit,
)
);
-TRACE_DEFINE_ENUM(LOOKUP_FOLLOW);
-TRACE_DEFINE_ENUM(LOOKUP_DIRECTORY);
-TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT);
-TRACE_DEFINE_ENUM(LOOKUP_PARENT);
-TRACE_DEFINE_ENUM(LOOKUP_REVAL);
-TRACE_DEFINE_ENUM(LOOKUP_RCU);
-TRACE_DEFINE_ENUM(LOOKUP_OPEN);
-TRACE_DEFINE_ENUM(LOOKUP_CREATE);
-TRACE_DEFINE_ENUM(LOOKUP_EXCL);
-TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET);
-TRACE_DEFINE_ENUM(LOOKUP_EMPTY);
-TRACE_DEFINE_ENUM(LOOKUP_DOWN);
-
-#define show_lookup_flags(flags) \
- __print_flags(flags, "|", \
- { LOOKUP_FOLLOW, "FOLLOW" }, \
- { LOOKUP_DIRECTORY, "DIRECTORY" }, \
- { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
- { LOOKUP_PARENT, "PARENT" }, \
- { LOOKUP_REVAL, "REVAL" }, \
- { LOOKUP_RCU, "RCU" }, \
- { LOOKUP_OPEN, "OPEN" }, \
- { LOOKUP_CREATE, "CREATE" }, \
- { LOOKUP_EXCL, "EXCL" }, \
- { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \
- { LOOKUP_EMPTY, "EMPTY" }, \
- { LOOKUP_DOWN, "DOWN" })
+DECLARE_EVENT_CLASS(nfs_update_size_class,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t new_size
+ ),
+
+ TP_ARGS(inode, new_size),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, cur_size)
+ __field(loff_t, new_size)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->fileid = nfsi->fileid;
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->cur_size = i_size_read(inode);
+ __entry->new_size = new_size;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu cursize=%lld newsize=%lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->cur_size, __entry->new_size
+ )
+);
+
+#define DEFINE_NFS_UPDATE_SIZE_EVENT(name) \
+ DEFINE_EVENT(nfs_update_size_class, nfs_size_##name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ loff_t new_size \
+ ), \
+ TP_ARGS(inode, new_size))
+
+DEFINE_NFS_UPDATE_SIZE_EVENT(truncate);
+DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
+DEFINE_NFS_UPDATE_SIZE_EVENT(update);
+DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
+
+DECLARE_EVENT_CLASS(nfs_inode_range_event,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t range_start,
+ loff_t range_end
+ ),
+
+ TP_ARGS(inode, range_start, range_end),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, range_start)
+ __field(loff_t, range_end)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->fileid = nfsi->fileid;
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->range_start = range_start;
+ __entry->range_end = range_end;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "range=[%lld, %lld]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->range_start, __entry->range_end
+ )
+);
+
+#define DEFINE_NFS_INODE_RANGE_EVENT(name) \
+ DEFINE_EVENT(nfs_inode_range_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ loff_t range_start, \
+ loff_t range_end \
+ ), \
+ TP_ARGS(inode, range_start, range_end))
+
+DEFINE_NFS_INODE_RANGE_EVENT(nfs_readdir_invalidate_cache_range);
+
+DECLARE_EVENT_CLASS(nfs_readdir_event,
+ TP_PROTO(
+ const struct file *file,
+ const __be32 *verifier,
+ u64 cookie,
+ pgoff_t page_index,
+ unsigned int dtsize
+ ),
+
+ TP_ARGS(file, verifier, cookie, page_index, dtsize),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __array(char, verifier, NFS4_VERIFIER_SIZE)
+ __field(u64, cookie)
+ __field(pgoff_t, index)
+ __field(unsigned int, dtsize)
+ ),
+
+ TP_fast_assign(
+ const struct inode *dir = file_inode(file);
+ const struct nfs_inode *nfsi = NFS_I(dir);
+
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(dir);
+ if (cookie != 0)
+ memcpy(__entry->verifier, verifier,
+ NFS4_VERIFIER_SIZE);
+ else
+ memset(__entry->verifier, 0,
+ NFS4_VERIFIER_SIZE);
+ __entry->cookie = cookie;
+ __entry->index = page_index;
+ __entry->dtsize = dtsize;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "cookie=%s:0x%llx cache_index=%lu dtsize=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ __entry->version, show_nfs4_verifier(__entry->verifier),
+ (unsigned long long)__entry->cookie, __entry->index,
+ __entry->dtsize
+ )
+);
+
+#define DEFINE_NFS_READDIR_EVENT(name) \
+ DEFINE_EVENT(nfs_readdir_event, name, \
+ TP_PROTO( \
+ const struct file *file, \
+ const __be32 *verifier, \
+ u64 cookie, \
+ pgoff_t page_index, \
+ unsigned int dtsize \
+ ), \
+ TP_ARGS(file, verifier, cookie, page_index, dtsize))
+
+DEFINE_NFS_READDIR_EVENT(nfs_readdir_cache_fill);
+DEFINE_NFS_READDIR_EVENT(nfs_readdir_uncached);
DECLARE_EVENT_CLASS(nfs_lookup_event,
TP_PROTO(
@@ -324,7 +416,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event,
TP_printk(
"flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
__entry->flags,
- show_lookup_flags(__entry->flags),
+ show_fs_lookup_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -368,9 +460,9 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done,
TP_printk(
"error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
__entry->flags,
- show_lookup_flags(__entry->flags),
+ show_fs_lookup_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -391,46 +483,9 @@ DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter);
DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
-
-TRACE_DEFINE_ENUM(O_WRONLY);
-TRACE_DEFINE_ENUM(O_RDWR);
-TRACE_DEFINE_ENUM(O_CREAT);
-TRACE_DEFINE_ENUM(O_EXCL);
-TRACE_DEFINE_ENUM(O_NOCTTY);
-TRACE_DEFINE_ENUM(O_TRUNC);
-TRACE_DEFINE_ENUM(O_APPEND);
-TRACE_DEFINE_ENUM(O_NONBLOCK);
-TRACE_DEFINE_ENUM(O_DSYNC);
-TRACE_DEFINE_ENUM(O_DIRECT);
-TRACE_DEFINE_ENUM(O_LARGEFILE);
-TRACE_DEFINE_ENUM(O_DIRECTORY);
-TRACE_DEFINE_ENUM(O_NOFOLLOW);
-TRACE_DEFINE_ENUM(O_NOATIME);
-TRACE_DEFINE_ENUM(O_CLOEXEC);
-
-#define show_open_flags(flags) \
- __print_flags(flags, "|", \
- { O_WRONLY, "O_WRONLY" }, \
- { O_RDWR, "O_RDWR" }, \
- { O_CREAT, "O_CREAT" }, \
- { O_EXCL, "O_EXCL" }, \
- { O_NOCTTY, "O_NOCTTY" }, \
- { O_TRUNC, "O_TRUNC" }, \
- { O_APPEND, "O_APPEND" }, \
- { O_NONBLOCK, "O_NONBLOCK" }, \
- { O_DSYNC, "O_DSYNC" }, \
- { O_DIRECT, "O_DIRECT" }, \
- { O_LARGEFILE, "O_LARGEFILE" }, \
- { O_DIRECTORY, "O_DIRECTORY" }, \
- { O_NOFOLLOW, "O_NOFOLLOW" }, \
- { O_NOATIME, "O_NOATIME" }, \
- { O_CLOEXEC, "O_CLOEXEC" })
-
-#define show_fmode_flags(mode) \
- __print_flags(mode, "|", \
- { ((__force unsigned long)FMODE_READ), "READ" }, \
- { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
- { ((__force unsigned long)FMODE_EXEC), "EXEC" })
+DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup);
+DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup_revalidate_failed);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_readdir_lookup_revalidate);
TRACE_EVENT(nfs_atomic_open_enter,
TP_PROTO(
@@ -443,7 +498,7 @@ TRACE_EVENT(nfs_atomic_open_enter,
TP_STRUCT__entry(
__field(unsigned long, flags)
- __field(unsigned int, fmode)
+ __field(unsigned long, fmode)
__field(dev_t, dev)
__field(u64, dir)
__string(name, ctx->dentry->d_name.name)
@@ -453,15 +508,15 @@ TRACE_EVENT(nfs_atomic_open_enter,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
- __entry->fmode = (__force unsigned int)ctx->mode;
+ __entry->fmode = (__force unsigned long)ctx->mode;
__assign_str(name, ctx->dentry->d_name.name);
),
TP_printk(
"flags=0x%lx (%s) fmode=%s name=%02x:%02x:%llu/%s",
__entry->flags,
- show_open_flags(__entry->flags),
- show_fmode_flags(__entry->fmode),
+ show_fs_fcntl_open_flags(__entry->flags),
+ show_fs_fmode_flags(__entry->fmode),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -481,7 +536,7 @@ TRACE_EVENT(nfs_atomic_open_exit,
TP_STRUCT__entry(
__field(unsigned long, error)
__field(unsigned long, flags)
- __field(unsigned int, fmode)
+ __field(unsigned long, fmode)
__field(dev_t, dev)
__field(u64, dir)
__string(name, ctx->dentry->d_name.name)
@@ -492,17 +547,17 @@ TRACE_EVENT(nfs_atomic_open_exit,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
- __entry->fmode = (__force unsigned int)ctx->mode;
+ __entry->fmode = (__force unsigned long)ctx->mode;
__assign_str(name, ctx->dentry->d_name.name);
),
TP_printk(
"error=%ld (%s) flags=0x%lx (%s) fmode=%s "
"name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
__entry->flags,
- show_open_flags(__entry->flags),
- show_fmode_flags(__entry->fmode),
+ show_fs_fcntl_open_flags(__entry->flags),
+ show_fs_fmode_flags(__entry->fmode),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -535,7 +590,7 @@ TRACE_EVENT(nfs_create_enter,
TP_printk(
"flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
__entry->flags,
- show_open_flags(__entry->flags),
+ show_fs_fcntl_open_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -570,9 +625,9 @@ TRACE_EVENT(nfs_create_exit,
TP_printk(
"error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
__entry->flags,
- show_open_flags(__entry->flags),
+ show_fs_fcntl_open_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -640,7 +695,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done,
TP_printk(
"error=%ld (%s) name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -730,7 +785,7 @@ TRACE_EVENT(nfs_link_exit,
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->fileid,
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -817,7 +872,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done,
TP_printk(
"error=%ld (%s) old_name=%02x:%02x:%llu/%s "
"new_name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->old_dir,
__get_str(old_name),
@@ -871,13 +926,163 @@ TRACE_EVENT(nfs_sillyrename_unlink,
TP_printk(
"error=%ld (%s) name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
)
);
+TRACE_EVENT(nfs_aop_readpage,
+ TP_PROTO(
+ const struct inode *inode,
+ struct page *page
+ ),
+
+ TP_ARGS(inode, page),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = page_index(page) << PAGE_SHIFT;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset
+ )
+);
+
+TRACE_EVENT(nfs_aop_readpage_done,
+ TP_PROTO(
+ const struct inode *inode,
+ struct page *page,
+ int ret
+ ),
+
+ TP_ARGS(inode, page, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(int, ret)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld ret=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->ret
+ )
+);
+
+TRACE_EVENT(nfs_aop_readahead,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t pos,
+ unsigned int nr_pages
+ ),
+
+ TP_ARGS(inode, pos, nr_pages),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(unsigned int, nr_pages)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = pos;
+ __entry->nr_pages = nr_pages;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld nr_pages=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->nr_pages
+ )
+);
+
+TRACE_EVENT(nfs_aop_readahead_done,
+ TP_PROTO(
+ const struct inode *inode,
+ unsigned int nr_pages,
+ int ret
+ ),
+
+ TP_ARGS(inode, nr_pages, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(int, ret)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(unsigned int, nr_pages)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->nr_pages = nr_pages;
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu nr_pages=%u ret=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->nr_pages, __entry->ret
+ )
+);
+
TRACE_EVENT(nfs_initiate_read,
TP_PROTO(
const struct nfs_pgio_header *hdr
@@ -1010,6 +1215,97 @@ TRACE_EVENT(nfs_readpage_short,
)
);
+DECLARE_EVENT_CLASS(nfs_fscache_page_event,
+ TP_PROTO(
+ const struct inode *inode,
+ struct page *page
+ ),
+
+ TP_ARGS(inode, page),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset
+ )
+);
+DECLARE_EVENT_CLASS(nfs_fscache_page_event_done,
+ TP_PROTO(
+ const struct inode *inode,
+ struct page *page,
+ int error
+ ),
+
+ TP_ARGS(inode, page, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld error=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->error
+ )
+);
+#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \
+ DEFINE_EVENT(nfs_fscache_page_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ struct page *page \
+ ), \
+ TP_ARGS(inode, page))
+#define DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_fscache_page_event_done, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ struct page *page, \
+ int error \
+ ), \
+ TP_ARGS(inode, page, error))
+DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page);
+DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit);
+DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page);
+DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit);
+
TRACE_EVENT(nfs_pgio_error,
TP_PROTO(
const struct nfs_pgio_header *hdr,
@@ -1054,16 +1350,6 @@ TRACE_EVENT(nfs_pgio_error,
)
);
-TRACE_DEFINE_ENUM(NFS_UNSTABLE);
-TRACE_DEFINE_ENUM(NFS_DATA_SYNC);
-TRACE_DEFINE_ENUM(NFS_FILE_SYNC);
-
-#define nfs_show_stable(stable) \
- __print_symbolic(stable, \
- { NFS_UNSTABLE, "UNSTABLE" }, \
- { NFS_DATA_SYNC, "DATA_SYNC" }, \
- { NFS_FILE_SYNC, "FILE_SYNC" })
-
TRACE_EVENT(nfs_initiate_write,
TP_PROTO(
const struct nfs_pgio_header *hdr
@@ -1077,7 +1363,7 @@ TRACE_EVENT(nfs_initiate_write,
__field(u64, fileid)
__field(loff_t, offset)
__field(u32, count)
- __field(enum nfs3_stable_how, stable)
+ __field(unsigned long, stable)
),
TP_fast_assign(
@@ -1101,7 +1387,7 @@ TRACE_EVENT(nfs_initiate_write,
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->count,
- nfs_show_stable(__entry->stable)
+ show_nfs_stable_how(__entry->stable)
)
);
@@ -1121,7 +1407,7 @@ TRACE_EVENT(nfs_writeback_done,
__field(u32, arg_count)
__field(u32, res_count)
__field(int, status)
- __field(enum nfs3_stable_how, stable)
+ __field(unsigned long, stable)
__array(char, verifier, NFS4_VERIFIER_SIZE)
),
@@ -1154,8 +1440,8 @@ TRACE_EVENT(nfs_writeback_done,
__entry->fhandle,
(long long)__entry->offset, __entry->arg_count,
__entry->res_count, __entry->status,
- nfs_show_stable(__entry->stable),
- __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE)
+ show_nfs_stable_how(__entry->stable),
+ show_nfs4_verifier(__entry->verifier)
)
);
@@ -1256,7 +1542,7 @@ TRACE_EVENT(nfs_commit_done,
__field(u64, fileid)
__field(loff_t, offset)
__field(int, status)
- __field(enum nfs3_stable_how, stable)
+ __field(unsigned long, stable)
__array(char, verifier, NFS4_VERIFIER_SIZE)
),
@@ -1285,8 +1571,8 @@ TRACE_EVENT(nfs_commit_done,
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->status,
- nfs_show_stable(__entry->stable),
- __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE)
+ show_nfs_stable_how(__entry->stable),
+ show_nfs4_verifier(__entry->verifier)
)
);
@@ -1323,76 +1609,6 @@ TRACE_EVENT(nfs_fh_to_dentry,
)
);
-TRACE_DEFINE_ENUM(NFS_OK);
-TRACE_DEFINE_ENUM(NFSERR_PERM);
-TRACE_DEFINE_ENUM(NFSERR_NOENT);
-TRACE_DEFINE_ENUM(NFSERR_IO);
-TRACE_DEFINE_ENUM(NFSERR_NXIO);
-TRACE_DEFINE_ENUM(ECHILD);
-TRACE_DEFINE_ENUM(NFSERR_EAGAIN);
-TRACE_DEFINE_ENUM(NFSERR_ACCES);
-TRACE_DEFINE_ENUM(NFSERR_EXIST);
-TRACE_DEFINE_ENUM(NFSERR_XDEV);
-TRACE_DEFINE_ENUM(NFSERR_NODEV);
-TRACE_DEFINE_ENUM(NFSERR_NOTDIR);
-TRACE_DEFINE_ENUM(NFSERR_ISDIR);
-TRACE_DEFINE_ENUM(NFSERR_INVAL);
-TRACE_DEFINE_ENUM(NFSERR_FBIG);
-TRACE_DEFINE_ENUM(NFSERR_NOSPC);
-TRACE_DEFINE_ENUM(NFSERR_ROFS);
-TRACE_DEFINE_ENUM(NFSERR_MLINK);
-TRACE_DEFINE_ENUM(NFSERR_OPNOTSUPP);
-TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG);
-TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY);
-TRACE_DEFINE_ENUM(NFSERR_DQUOT);
-TRACE_DEFINE_ENUM(NFSERR_STALE);
-TRACE_DEFINE_ENUM(NFSERR_REMOTE);
-TRACE_DEFINE_ENUM(NFSERR_WFLUSH);
-TRACE_DEFINE_ENUM(NFSERR_BADHANDLE);
-TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC);
-TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE);
-TRACE_DEFINE_ENUM(NFSERR_NOTSUPP);
-TRACE_DEFINE_ENUM(NFSERR_TOOSMALL);
-TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT);
-TRACE_DEFINE_ENUM(NFSERR_BADTYPE);
-TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
-
-#define nfs_show_status(x) \
- __print_symbolic(x, \
- { NFS_OK, "OK" }, \
- { NFSERR_PERM, "PERM" }, \
- { NFSERR_NOENT, "NOENT" }, \
- { NFSERR_IO, "IO" }, \
- { NFSERR_NXIO, "NXIO" }, \
- { ECHILD, "CHILD" }, \
- { NFSERR_EAGAIN, "AGAIN" }, \
- { NFSERR_ACCES, "ACCES" }, \
- { NFSERR_EXIST, "EXIST" }, \
- { NFSERR_XDEV, "XDEV" }, \
- { NFSERR_NODEV, "NODEV" }, \
- { NFSERR_NOTDIR, "NOTDIR" }, \
- { NFSERR_ISDIR, "ISDIR" }, \
- { NFSERR_INVAL, "INVAL" }, \
- { NFSERR_FBIG, "FBIG" }, \
- { NFSERR_NOSPC, "NOSPC" }, \
- { NFSERR_ROFS, "ROFS" }, \
- { NFSERR_MLINK, "MLINK" }, \
- { NFSERR_OPNOTSUPP, "OPNOTSUPP" }, \
- { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \
- { NFSERR_NOTEMPTY, "NOTEMPTY" }, \
- { NFSERR_DQUOT, "DQUOT" }, \
- { NFSERR_STALE, "STALE" }, \
- { NFSERR_REMOTE, "REMOTE" }, \
- { NFSERR_WFLUSH, "WFLUSH" }, \
- { NFSERR_BADHANDLE, "BADHANDLE" }, \
- { NFSERR_NOT_SYNC, "NOTSYNC" }, \
- { NFSERR_BAD_COOKIE, "BADCOOKIE" }, \
- { NFSERR_NOTSUPP, "NOTSUPP" }, \
- { NFSERR_TOOSMALL, "TOOSMALL" }, \
- { NFSERR_SERVERFAULT, "REMOTEIO" }, \
- { NFSERR_BADTYPE, "BADTYPE" }, \
- { NFSERR_JUKEBOX, "JUKEBOX" })
-
DECLARE_EVENT_CLASS(nfs_xdr_event,
TP_PROTO(
const struct xdr_stream *xdr,
@@ -1427,12 +1643,12 @@ DECLARE_EVENT_CLASS(nfs_xdr_event,
__assign_str(procedure, task->tk_msg.rpc_proc->p_name);
),
- TP_printk(
- "task:%u@%d xid=0x%08x %sv%d %s error=%ld (%s)",
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " xid=0x%08x %sv%d %s error=%ld (%s)",
__entry->task_id, __entry->client_id, __entry->xid,
__get_str(program), __entry->version,
__get_str(procedure), -__entry->error,
- nfs_show_status(__entry->error)
+ show_nfs_status(__entry->error)
)
);
#define DEFINE_NFS_XDR_EVENT(name) \
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index cc232d1f16f2..9157dd19b8b4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -90,10 +90,10 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
}
}
-static inline struct nfs_page *
-nfs_page_alloc(void)
+static inline struct nfs_page *nfs_page_alloc(void)
{
- struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
+ struct nfs_page *p =
+ kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask());
if (p)
INIT_LIST_HEAD(&p->wb_list);
return p;
@@ -271,8 +271,7 @@ nfs_page_set_headlock(struct nfs_page *req)
void
nfs_page_clear_headlock(struct nfs_page *req)
{
- smp_mb__before_atomic();
- clear_bit(PG_HEADLOCK, &req->wb_flags);
+ clear_bit_unlock(PG_HEADLOCK, &req->wb_flags);
smp_mb__after_atomic();
if (!test_bit(PG_CONTENDED1, &req->wb_flags))
return;
@@ -525,12 +524,7 @@ nfs_create_subreq(struct nfs_page *req,
*/
void nfs_unlock_request(struct nfs_page *req)
{
- if (!NFS_WBACK_BUSY(req)) {
- printk(KERN_ERR "NFS: Invalid unlock attempted\n");
- BUG();
- }
- smp_mb__before_atomic();
- clear_bit(PG_BUSY, &req->wb_flags);
+ clear_bit_unlock(PG_BUSY, &req->wb_flags);
smp_mb__after_atomic();
if (!test_bit(PG_CONTENDED2, &req->wb_flags))
return;
@@ -870,9 +864,6 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata)
struct nfs_pgio_header *hdr = calldata;
struct inode *inode = hdr->inode;
- dprintk("NFS: %s: %5u, (status %d)\n", __func__,
- task->tk_pid, task->tk_status);
-
if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
return;
if (task->tk_status < 0)
@@ -901,7 +892,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
struct nfs_commit_info cinfo;
struct nfs_page_array *pg_array = &hdr->page_array;
unsigned int pagecount, pageused;
- gfp_t gfp_flags = GFP_KERNEL;
+ gfp_t gfp_flags = nfs_io_gfp_mask();
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
pg_array->npages = pagecount;
@@ -988,7 +979,7 @@ nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc,
desc->pg_mirrors_dynamic = NULL;
if (mirror_count == 1)
return desc->pg_mirrors_static;
- ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL);
+ ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask());
if (ret != NULL) {
for (i = 0; i < mirror_count; i++)
nfs_pageio_mirror_init(&ret[i], desc->pg_bsize);
@@ -1227,6 +1218,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
do {
list_splice_init(&mirror->pg_list, &head);
+ mirror->pg_recoalesce = 0;
while (!list_empty(&head)) {
struct nfs_page *req;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7c9090a28e5c..856c962273c7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -92,6 +92,17 @@ find_pnfs_driver(u32 id)
return local;
}
+const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
+{
+ return find_pnfs_driver(id);
+}
+
+void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
+{
+ if (ld)
+ module_put(ld->owner);
+}
+
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
@@ -1233,7 +1244,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
int status = 0;
*pcred = NULL;
- lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
+ lrp = kzalloc(sizeof(*lrp), nfs_io_gfp_mask());
if (unlikely(lrp == NULL)) {
status = -ENOMEM;
spin_lock(&ino->i_lock);
@@ -2206,7 +2217,7 @@ _pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx)
struct pnfs_layout_hdr *lo;
spin_lock(&ino->i_lock);
- lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL);
+ lo = pnfs_find_alloc_layout(ino, ctx, nfs_io_gfp_mask());
if (!lo)
goto out_unlock;
if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
@@ -2249,8 +2260,8 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data,
lo = _pnfs_grab_empty_layout(ino, ctx);
if (!lo)
return;
- lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
- &rng, GFP_KERNEL);
+ lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
+ nfs_io_gfp_mask());
if (!lgp) {
pnfs_clear_first_layoutget(lo);
nfs_layoutget_end(lo);
@@ -2275,8 +2286,8 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data,
};
struct nfs4_layoutget *lgp;
- lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
- &rng, GFP_KERNEL);
+ lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
+ nfs_io_gfp_mask());
if (!lgp)
return;
data->lgp = lgp;
@@ -2691,13 +2702,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
else
rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- rd_size,
- IOMODE_READ,
- false,
- GFP_KERNEL);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), rd_size,
+ IOMODE_READ, false,
+ nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -2718,13 +2727,10 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
pnfs_generic_pg_check_layout(pgio);
pnfs_generic_pg_check_range(pgio, req);
if (pgio->pg_lseg == NULL) {
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- wb_size,
- IOMODE_RW,
- false,
- GFP_KERNEL);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), wb_size, IOMODE_RW,
+ false, nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -3183,7 +3189,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
status = -ENOMEM;
/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
- data = kzalloc(sizeof(*data), GFP_NOFS);
+ data = kzalloc(sizeof(*data), nfs_io_gfp_mask());
if (!data)
goto clear_layoutcommitting;
@@ -3250,7 +3256,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
struct nfs4_threshold *thp;
- thp = kzalloc(sizeof(*thp), GFP_NOFS);
+ thp = kzalloc(sizeof(*thp), nfs_io_gfp_mask());
if (!thp) {
dprintk("%s mdsthreshold allocation failed\n", __func__);
return NULL;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d810ae674f4e..07f11489e4e9 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -82,10 +82,6 @@ enum pnfs_try_status {
PNFS_TRY_AGAIN = 2,
};
-/* error codes for internal use */
-#define NFS4ERR_RESET_TO_MDS 12001
-#define NFS4ERR_RESET_TO_PNFS 12002
-
#ifdef CONFIG_NFS_V4_1
#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -238,6 +234,8 @@ struct pnfs_devicelist {
extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id);
+extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld);
/* nfs4proc.c */
extern size_t max_response_pages(struct nfs_server *server);
@@ -517,7 +515,7 @@ pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
{
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (!lseg || !fl_cinfo->ops->mark_request_commit)
+ if (!lseg || !fl_cinfo->ops || !fl_cinfo->ops->mark_request_commit)
return false;
fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
return true;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index cf19914fec81..657c242a18ff 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -419,7 +419,7 @@ static struct nfs_commit_data *
pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
struct nfs_commit_info *cinfo)
{
- struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+ struct nfs_commit_data *data = nfs_commitdata_alloc();
if (!data)
return NULL;
@@ -468,7 +468,6 @@ pnfs_bucket_alloc_ds_commits(struct list_head *list,
goto out_error;
data->ds_commit_index = i;
list_add_tail(&data->list, list);
- atomic_inc(&cinfo->mds->rpcs_out);
nreq++;
}
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
@@ -516,11 +515,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
unsigned int nreq = 0;
if (!list_empty(mds_pages)) {
- data = nfs_commitdata_alloc(true);
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(mds_pages, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
data->ds_commit_index = -1;
list_splice_init(mds_pages, &data->pages);
list_add_tail(&data->list, &list);
- atomic_inc(&cinfo->mds->rpcs_out);
nreq++;
}
@@ -895,7 +897,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
}
smp_wmb();
- ds->ds_clp = clp;
+ WRITE_ONCE(ds->ds_clp, clp);
dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
out:
return status;
@@ -973,7 +975,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
}
smp_wmb();
- ds->ds_clp = clp;
+ WRITE_ONCE(ds->ds_clp, clp);
dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
out:
return status;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ea19dbf12301..e3570c656b0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -91,7 +91,8 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
info->dtpref = fsinfo.tsize;
info->maxfilesize = 0x7FFFFFFF;
info->lease_time = 0;
- info->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
+ info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ info->xattr_support = 0;
return 0;
}
@@ -100,8 +101,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode)
+ struct nfs_fattr *fattr, struct inode *inode)
{
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
@@ -154,8 +154,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
static int
nfs_proc_lookup(struct inode *dir, struct dentry *dentry,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
struct nfs_diropargs arg = {
.fh = NFS_FH(dir),
@@ -257,7 +256,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
nfs_free_createdata(data);
out:
dprintk("NFS reply create: %d\n", status);
@@ -304,7 +303,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
}
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
nfs_free_createdata(data);
out:
dprintk("NFS reply mknod: %d\n", status);
@@ -436,7 +435,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
* should fill in the data with a LOOKUP call on the wire.
*/
if (status == 0)
- status = nfs_instantiate(dentry, fh, fattr, NULL);
+ status = nfs_instantiate(dentry, fh, fattr);
out_free:
nfs_free_fattr(fattr);
@@ -465,7 +464,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
nfs_free_createdata(data);
out:
dprintk("NFS reply mkdir: %d\n", status);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 08d6cc57cbc3..5e7657374bc3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -123,7 +123,7 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
struct address_space *mapping = page_file_mapping(page);
if (PageUptodate(page))
- nfs_readpage_to_fscache(inode, page, 0);
+ nfs_fscache_write_page(inode, page);
else if (!PageError(page) && !PagePrivate(page))
generic_error_remove_page(mapping, page);
unlock_page(page);
@@ -194,10 +194,6 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
const struct nfs_rpc_ops *rpc_ops,
struct rpc_task_setup *task_setup_data, int how)
{
- struct inode *inode = hdr->inode;
- int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
-
- task_setup_data->flags |= swap_flags;
rpc_ops->read_setup(hdr, msg);
trace_nfs_initiate_read(hdr);
}
@@ -290,9 +286,8 @@ static void nfs_readpage_result(struct rpc_task *task,
}
static int
-readpage_async_filler(void *data, struct page *page)
+readpage_async_filler(struct nfs_readdesc *desc, struct page *page)
{
- struct nfs_readdesc *desc = data;
struct inode *inode = page_file_mapping(page)->host;
unsigned int rsize = NFS_SERVER(inode)->rsize;
struct nfs_page *new;
@@ -305,6 +300,12 @@ readpage_async_filler(void *data, struct page *page)
aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE);
+ if (!IS_SYNC(page->mapping->host)) {
+ error = nfs_fscache_read_page(page->mapping->host, page);
+ if (error == 0)
+ goto out_unlock;
+ }
+
new = nfs_create_request(desc->ctx, page, 0, aligned_len);
if (IS_ERR(new))
goto out_error;
@@ -320,6 +321,7 @@ readpage_async_filler(void *data, struct page *page)
return 0;
out_error:
error = PTR_ERR(new);
+out_unlock:
unlock_page(page);
out:
return error;
@@ -337,8 +339,7 @@ int nfs_readpage(struct file *file, struct page *page)
struct inode *inode = page_file_mapping(page)->host;
int ret;
- dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
- page, PAGE_SIZE, page_index(page));
+ trace_nfs_aop_readpage(inode, page);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
/*
@@ -367,12 +368,6 @@ int nfs_readpage(struct file *file, struct page *page)
desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
xchg(&desc.ctx->error, 0);
- if (!IS_SYNC(inode)) {
- ret = nfs_readpage_from_fscache(desc.ctx, inode, page);
- if (ret == 0)
- goto out_wait;
- }
-
nfs_pageio_init_read(&desc.pgio, inode, false,
&nfs_async_read_completion_ops);
@@ -382,7 +377,6 @@ int nfs_readpage(struct file *file, struct page *page)
nfs_pageio_complete_read(&desc.pgio);
ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0;
-out_wait:
if (!ret) {
ret = wait_on_page_locked_killable(page);
if (!PageUptodate(page) && !ret)
@@ -390,23 +384,24 @@ out_wait:
}
out:
put_nfs_open_context(desc.ctx);
+ trace_nfs_aop_readpage_done(inode, page, ret);
return ret;
out_unlock:
unlock_page(page);
+ trace_nfs_aop_readpage_done(inode, page, ret);
return ret;
}
-int nfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+void nfs_readahead(struct readahead_control *ractl)
{
+ unsigned int nr_pages = readahead_count(ractl);
+ struct file *file = ractl->file;
struct nfs_readdesc desc;
- struct inode *inode = mapping->host;
+ struct inode *inode = ractl->mapping->host;
+ struct page *page;
int ret;
- dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
- inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode),
- nr_pages);
+ trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
ret = -ESTALE;
@@ -421,25 +416,21 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
} else
desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
- /* attempt to read as many of the pages as possible from the cache
- * - this returns -ENOBUFS immediately if the cookie is negative
- */
- ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
- pages, &nr_pages);
- if (ret == 0)
- goto read_complete; /* all pages were read */
-
nfs_pageio_init_read(&desc.pgio, inode, false,
&nfs_async_read_completion_ops);
- ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
+ while ((page = readahead_page(ractl)) != NULL) {
+ ret = readpage_async_filler(&desc, page);
+ put_page(page);
+ if (ret)
+ break;
+ }
nfs_pageio_complete_read(&desc.pgio);
-read_complete:
put_nfs_open_context(desc.ctx);
out:
- return ret;
+ trace_nfs_aop_readahead_done(inode, nr_pages, ret);
}
int __init nfs_init_readpagecache(void)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e65c83494c05..6ab5eeb000dc 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1004,6 +1004,7 @@ int nfs_reconfigure(struct fs_context *fc)
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct super_block *sb = fc->root->d_sb;
struct nfs_server *nfss = sb->s_fs_info;
+ int ret;
sync_filesystem(sb);
@@ -1028,7 +1029,11 @@ int nfs_reconfigure(struct fs_context *fc)
}
/* compare new mount options with old ones */
- return nfs_compare_remount_data(nfss, ctx);
+ ret = nfs_compare_remount_data(nfss, ctx);
+ if (ret)
+ return ret;
+
+ return nfs_probe_server(nfss, NFS_FH(d_inode(fc->root)));
}
EXPORT_SYMBOL_GPL(nfs_reconfigure);
@@ -1199,42 +1204,42 @@ static int nfs_compare_super(struct super_block *sb, struct fs_context *fc)
}
#ifdef CONFIG_NFS_FSCACHE
-static void nfs_get_cache_cookie(struct super_block *sb,
- struct nfs_fs_context *ctx)
+static int nfs_get_cache_cookie(struct super_block *sb,
+ struct nfs_fs_context *ctx)
{
struct nfs_server *nfss = NFS_SB(sb);
char *uniq = NULL;
int ulen = 0;
- nfss->fscache_key = NULL;
nfss->fscache = NULL;
if (!ctx)
- return;
+ return 0;
if (ctx->clone_data.sb) {
struct nfs_server *mnt_s = NFS_SB(ctx->clone_data.sb);
if (!(mnt_s->options & NFS_OPTION_FSCACHE))
- return;
- if (mnt_s->fscache_key) {
- uniq = mnt_s->fscache_key->key.uniquifier;
- ulen = mnt_s->fscache_key->key.uniq_len;
+ return 0;
+ if (mnt_s->fscache_uniq) {
+ uniq = mnt_s->fscache_uniq;
+ ulen = strlen(uniq);
}
} else {
if (!(ctx->options & NFS_OPTION_FSCACHE))
- return;
+ return 0;
if (ctx->fscache_uniq) {
uniq = ctx->fscache_uniq;
ulen = strlen(ctx->fscache_uniq);
}
}
- nfs_fscache_get_super_cookie(sb, uniq, ulen);
+ return nfs_fscache_get_super_cookie(sb, uniq, ulen);
}
#else
-static void nfs_get_cache_cookie(struct super_block *sb,
- struct nfs_fs_context *ctx)
+static int nfs_get_cache_cookie(struct super_block *sb,
+ struct nfs_fs_context *ctx)
{
+ return 0;
}
#endif
@@ -1294,7 +1299,9 @@ int nfs_get_tree_common(struct fs_context *fc)
s->s_blocksize_bits = bsize;
s->s_blocksize = 1U << bsize;
}
- nfs_get_cache_cookie(s, ctx);
+ error = nfs_get_cache_cookie(s, ctx);
+ if (error < 0)
+ goto error_splat_super;
}
error = nfs_get_root(s, fc);
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index 8cb70755e3c9..a6f740366963 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -142,10 +142,11 @@ static struct attribute *nfs_netns_client_attrs[] = {
&nfs_netns_client_id.attr,
NULL,
};
+ATTRIBUTE_GROUPS(nfs_netns_client);
static struct kobj_type nfs_netns_client_type = {
.release = nfs_netns_client_release,
- .default_attrs = nfs_netns_client_attrs,
+ .default_groups = nfs_netns_client_groups,
.sysfs_ops = &kobj_sysfs_ops,
.namespace = nfs_netns_client_namespace,
};
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 5fa11e1aca4c..6f325e10056c 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -347,6 +347,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
return ERR_PTR(-ENOMEM);
+ task_setup_data.task = &data->task;
task_setup_data.callback_data = data;
data->cred = get_current_cred();
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index eae9bf114041..f00d45cf80ef 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool;
static struct kmem_cache *nfs_cdata_cachep;
static mempool_t *nfs_commit_mempool;
-struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
+struct nfs_commit_data *nfs_commitdata_alloc(void)
{
struct nfs_commit_data *p;
- if (never_fail)
- p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
- else {
- /* It is OK to do some reclaim, not no safe to wait
- * for anything to be returned to the pool.
- * mempool_alloc() cannot handle that particular combination,
- * so we need two separate attempts.
- */
+ p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask());
+ if (!p) {
p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
if (!p)
- p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
- __GFP_NOWARN | __GFP_NORETRY);
- if (!p)
return NULL;
+ memset(p, 0, sizeof(*p));
}
-
- memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
return p;
}
@@ -104,9 +94,15 @@ EXPORT_SYMBOL_GPL(nfs_commit_free);
static struct nfs_pgio_header *nfs_writehdr_alloc(void)
{
- struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL);
+ struct nfs_pgio_header *p;
- memset(p, 0, sizeof(*p));
+ p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask());
+ if (!p) {
+ p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT);
+ if (!p)
+ return NULL;
+ memset(p, 0, sizeof(*p));
+ }
p->rw_mode = FMODE_WRITE;
return p;
}
@@ -288,11 +284,13 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
end = page_file_offset(page) + ((loff_t)offset+count);
if (i_size >= end)
goto out;
+ trace_nfs_size_grow(inode, end);
i_size_write(inode, end);
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
out:
spin_unlock(&inode->i_lock);
+ nfs_fscache_invalidate(inode, 0);
}
/* A writeback failed: mark the page as bad, and invalidate the page cache */
@@ -304,7 +302,7 @@ static void nfs_set_pageerror(struct address_space *mapping)
/* Force file size revalidation */
spin_lock(&inode->i_lock);
nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED |
- NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_INVALID_CHANGE |
NFS_INO_INVALID_SIZE);
spin_unlock(&inode->i_lock);
}
@@ -314,7 +312,10 @@ static void nfs_mapping_set_error(struct page *page, int error)
struct address_space *mapping = page_file_mapping(page);
SetPageError(page);
- mapping_set_error(mapping, error);
+ filemap_set_wb_err(mapping, error);
+ if (mapping->host)
+ errseq_set(&mapping->host->i_sb->s_wb_err,
+ error == -ENOSPC ? -ENOSPC : -EIO);
nfs_set_pageerror(mapping);
}
@@ -415,7 +416,7 @@ static void nfs_set_page_writeback(struct page *page)
if (atomic_long_inc_return(&nfss->writeback) >
NFS_CONGESTION_ON_THRESH)
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ nfss->write_congested = 1;
}
static void nfs_end_page_writeback(struct nfs_page *req)
@@ -431,7 +432,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)
end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ nfss->write_congested = 0;
}
/*
@@ -670,6 +671,10 @@ static int nfs_writepage_locked(struct page *page,
struct inode *inode = page_file_mapping(page)->host;
int err;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ NFS_SERVER(inode)->write_congested)
+ return AOP_WRITEPAGE_ACTIVATE;
+
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops);
@@ -717,6 +722,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
int priority = 0;
int err;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ NFS_SERVER(inode)->write_congested)
+ return 0;
+
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
@@ -1038,25 +1047,11 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
struct nfs_page *req, *tmp;
int ret = 0;
-restart:
list_for_each_entry_safe(req, tmp, src, wb_list) {
kref_get(&req->wb_kref);
if (!nfs_lock_request(req)) {
- int status;
-
- /* Prevent deadlock with nfs_lock_and_join_requests */
- if (!list_empty(dst)) {
- nfs_release_request(req);
- continue;
- }
- /* Ensure we make progress to prevent livelock */
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- status = nfs_wait_on_request(req);
nfs_release_request(req);
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- if (status < 0)
- break;
- goto restart;
+ continue;
}
nfs_request_remove_commit_list(req, cinfo);
clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
@@ -1246,7 +1241,7 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
struct nfs_open_context *ctx = nfs_file_open_context(filp);
if (nfs_ctx_key_to_expire(ctx, inode) &&
- !ctx->ll_cred)
+ !rcu_access_pointer(ctx->ll_cred))
/* Already expired! */
return -EACCES;
return 0;
@@ -1258,23 +1253,38 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
{
struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
- struct rpc_cred *cred = ctx->ll_cred;
+ struct rpc_cred *cred, *new, *old = NULL;
struct auth_cred acred = {
.cred = ctx->cred,
};
+ bool ret = false;
- if (cred && !cred->cr_ops->crmatch(&acred, cred, 0)) {
- put_rpccred(cred);
- ctx->ll_cred = NULL;
- cred = NULL;
- }
- if (!cred)
- cred = auth->au_ops->lookup_cred(auth, &acred, 0);
- if (!cred || IS_ERR(cred))
+ rcu_read_lock();
+ cred = rcu_dereference(ctx->ll_cred);
+ if (cred && !(cred->cr_ops->crkey_timeout &&
+ cred->cr_ops->crkey_timeout(cred)))
+ goto out;
+ rcu_read_unlock();
+
+ new = auth->au_ops->lookup_cred(auth, &acred, 0);
+ if (new == cred) {
+ put_rpccred(new);
return true;
- ctx->ll_cred = cred;
- return !!(cred->cr_ops->crkey_timeout &&
- cred->cr_ops->crkey_timeout(cred));
+ }
+ if (IS_ERR_OR_NULL(new)) {
+ new = NULL;
+ ret = true;
+ } else if (new->cr_ops->crkey_timeout &&
+ new->cr_ops->crkey_timeout(new))
+ ret = true;
+
+ rcu_read_lock();
+ old = rcu_dereference_protected(xchg(&ctx->ll_cred,
+ RCU_INITIALIZER(new)), 1);
+out:
+ rcu_read_unlock();
+ put_rpccred(old);
+ return ret;
}
/*
@@ -1382,8 +1392,6 @@ int nfs_updatepage(struct file *file, struct page *page,
status = nfs_writepage_setup(ctx, page, offset, count);
if (status < 0)
nfs_set_pageerror(mapping);
- else
- __set_page_dirty_nobuffers(page);
out:
dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
@@ -1408,6 +1416,8 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
{
int priority = flush_task_priority(how);
+ if (IS_SWAPFILE(hdr->inode))
+ task_setup_data->flags |= RPC_TASK_SWAPPER;
task_setup_data->priority = priority;
rpc_ops->write_setup(hdr, msg, &task_setup_data->rpc_client);
trace_nfs_initiate_write(hdr);
@@ -1671,10 +1681,13 @@ static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
atomic_inc(&cinfo->rpcs_out);
}
-static void nfs_commit_end(struct nfs_mds_commit_info *cinfo)
+bool nfs_commit_end(struct nfs_mds_commit_info *cinfo)
{
- if (atomic_dec_and_test(&cinfo->rpcs_out))
+ if (atomic_dec_and_test(&cinfo->rpcs_out)) {
wake_up_var(&cinfo->rpcs_out);
+ return true;
+ }
+ return false;
}
void nfs_commitdata_release(struct nfs_commit_data *data)
@@ -1774,6 +1787,7 @@ void nfs_init_commit(struct nfs_commit_data *data,
data->res.fattr = &data->fattr;
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);
+ nfs_commit_begin(cinfo->mds);
}
EXPORT_SYMBOL_GPL(nfs_init_commit);
@@ -1816,11 +1830,14 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
if (list_empty(head))
return 0;
- data = nfs_commitdata_alloc(true);
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(head, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
- atomic_inc(&cinfo->mds->rpcs_out);
if (NFS_SERVER(inode)->nfs_client->cl_minorversion)
task_flags = RPC_TASK_MOVEABLE;
return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
@@ -1835,9 +1852,6 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
{
struct nfs_commit_data *data = calldata;
- dprintk("NFS: %5u nfs_commit_done (status %d)\n",
- task->tk_pid, task->tk_status);
-
/* Call the NFS version-specific code */
NFS_PROTO(data->inode)->commit_done(task, data);
trace_nfs_commit_done(task, data);
@@ -1892,7 +1906,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
}
nfss = NFS_SERVER(data->inode);
if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
+ nfss->write_congested = 0;
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
@@ -1936,6 +1950,7 @@ static int __nfs_commit_inode(struct inode *inode, int how,
int may_wait = how & FLUSH_SYNC;
int ret, nscan;
+ how &= ~FLUSH_SYNC;
nfs_init_cinfo_from_inode(&cinfo, inode);
nfs_commit_begin(cinfo.mds);
for (;;) {
@@ -2047,21 +2062,21 @@ out:
}
EXPORT_SYMBOL_GPL(nfs_wb_all);
-int nfs_wb_page_cancel(struct inode *inode, struct page *page)
+int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
{
struct nfs_page *req;
int ret = 0;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
/* blocking call to cancel all requests and join to a single (head)
* request */
- req = nfs_lock_and_join_requests(page);
+ req = nfs_lock_and_join_requests(&folio->page);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
} else if (req) {
- /* all requests from this page have been cancelled by
+ /* all requests from this folio have been cancelled by
* nfs_lock_and_join_requests, so just remove the head
* request from the inode / page_private pointer and
* release it */
@@ -2124,8 +2139,11 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
if (PagePrivate(page))
return -EBUSY;
- if (!nfs_fscache_release_page(page, GFP_KERNEL))
- return -EBUSY;
+ if (PageFsCache(page)) {
+ if (mode == MIGRATE_ASYNC)
+ return -EBUSY;
+ wait_on_page_fscache(page);
+ }
return migrate_page(mapping, newpage, page, mode);
}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 6e9ea4ee0f73..f6a2fd3015e7 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -35,18 +35,9 @@ config NFSD_V2_ACL
bool
depends on NFSD
-config NFSD_V3
- bool "NFS server support for NFS version 3"
- depends on NFSD
- help
- This option enables support in your system's NFS server for
- version 3 of the NFS protocol (RFC 1813).
-
- If unsure, say Y.
-
config NFSD_V3_ACL
bool "NFS server support for the NFSv3 ACL protocol extension"
- depends on NFSD_V3
+ depends on NFSD
select NFSD_V2_ACL
help
Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
@@ -70,7 +61,6 @@ config NFSD_V3_ACL
config NFSD_V4
bool "NFS server support for NFS version 4"
depends on NFSD && PROC_FS
- select NFSD_V3
select FS_POSIX_ACL
select SUNRPC_GSS
select CRYPTO
@@ -109,7 +99,6 @@ config NFSD_SCSILAYOUT
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
select EXPORTFS_BLOCK_OPS
- select SCSI_COMMON
help
This option enables support for the exporting pNFS SCSI layouts
in the kernel's NFS server. The pNFS SCSI layout enables NFS
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 3f0983e93a99..805c06d5f1b4 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -12,9 +12,8 @@ nfsd-y += trace.o
nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o nfsxdr.o \
- stats.o filecache.o
+ stats.o filecache.o nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
-nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c99dee99a3c1..b6d01d51a746 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -4,14 +4,10 @@
*/
#include <linux/exportfs.h>
#include <linux/iomap.h>
-#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/pr.h>
#include <linux/nfsd/debug.h>
-#include <scsi/scsi_proto.h>
-#include <scsi/scsi_common.h>
-#include <scsi/scsi_request.h>
#include "blocklayoutxdr.h"
#include "pnfs.h"
@@ -211,109 +207,6 @@ const struct nfsd4_layout_ops bl_layout_ops = {
#endif /* CONFIG_NFSD_BLOCKLAYOUT */
#ifdef CONFIG_NFSD_SCSILAYOUT
-static int nfsd4_scsi_identify_device(struct block_device *bdev,
- struct pnfs_block_volume *b)
-{
- struct request_queue *q = bdev->bd_disk->queue;
- struct request *rq;
- struct scsi_request *req;
- /*
- * The allocation length (passed in bytes 3 and 4 of the INQUIRY
- * command descriptor block) specifies the number of bytes that have
- * been allocated for the data-in buffer.
- * 252 is the highest one-byte value that is a multiple of 4.
- * 65532 is the highest two-byte value that is a multiple of 4.
- */
- size_t bufflen = 252, maxlen = 65532, len, id_len;
- u8 *buf, *d, type, assoc;
- int retries = 1, error;
-
- if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q)))
- return -EINVAL;
-
-again:
- buf = kzalloc(bufflen, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
- if (IS_ERR(rq)) {
- error = -ENOMEM;
- goto out_free_buf;
- }
- req = scsi_req(rq);
-
- error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
- if (error)
- goto out_put_request;
-
- req->cmd[0] = INQUIRY;
- req->cmd[1] = 1;
- req->cmd[2] = 0x83;
- req->cmd[3] = bufflen >> 8;
- req->cmd[4] = bufflen & 0xff;
- req->cmd_len = COMMAND_SIZE(INQUIRY);
-
- blk_execute_rq(NULL, rq, 1);
- if (req->result) {
- pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
- req->result);
- error = -EIO;
- goto out_put_request;
- }
-
- len = (buf[2] << 8) + buf[3] + 4;
- if (len > bufflen) {
- if (len <= maxlen && retries--) {
- blk_put_request(rq);
- kfree(buf);
- bufflen = len;
- goto again;
- }
- pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
- len);
- goto out_put_request;
- }
-
- d = buf + 4;
- for (d = buf + 4; d < buf + len; d += id_len + 4) {
- id_len = d[3];
- type = d[1] & 0xf;
- assoc = (d[1] >> 4) & 0x3;
-
- /*
- * We only care about a EUI-64 and NAA designator types
- * with LU association.
- */
- if (assoc != 0x00)
- continue;
- if (type != 0x02 && type != 0x03)
- continue;
- if (id_len != 8 && id_len != 12 && id_len != 16)
- continue;
-
- b->scsi.code_set = PS_CODE_SET_BINARY;
- b->scsi.designator_type = type == 0x02 ?
- PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
- b->scsi.designator_len = id_len;
- memcpy(b->scsi.designator, d + 4, id_len);
-
- /*
- * If we found a 8 or 12 byte descriptor continue on to
- * see if a 16 byte one is available. If we find a
- * 16 byte descriptor we're done.
- */
- if (id_len == 16)
- break;
- }
-
-out_put_request:
- blk_put_request(rq);
-out_free_buf:
- kfree(buf);
- return error;
-}
-
#define NFSD_MDS_PR_KEY 0x0100000000000000ULL
/*
@@ -325,6 +218,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
}
+static const u8 designator_types[] = {
+ PS_DESIGNATOR_EUI64,
+ PS_DESIGNATOR_NAA,
+};
+
+static int
+nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b)
+{
+ int ret, i;
+
+ for (i = 0; i < ARRAY_SIZE(designator_types); i++) {
+ u8 type = designator_types[i];
+
+ ret = disk->fops->get_unique_id(disk, b->scsi.designator, type);
+ if (ret > 0) {
+ b->scsi.code_set = PS_CODE_SET_BINARY;
+ b->scsi.designator_type = type;
+ b->scsi.designator_len = ret;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
static int
nfsd4_block_get_device_info_scsi(struct super_block *sb,
struct nfs4_client *clp,
@@ -333,7 +251,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
struct pnfs_block_deviceaddr *dev;
struct pnfs_block_volume *b;
const struct pr_ops *ops;
- int error;
+ int ret;
dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
sizeof(struct pnfs_block_volume), GFP_KERNEL);
@@ -347,33 +265,38 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
b->type = PNFS_BLOCK_VOLUME_SCSI;
b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
- error = nfsd4_scsi_identify_device(sb->s_bdev, b);
- if (error)
- return error;
+ ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b);
+ if (ret < 0)
+ goto out_free_dev;
+ ret = -EINVAL;
ops = sb->s_bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: device %s does not support PRs.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
- error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
- if (error) {
+ ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
+ if (ret) {
pr_err("pNFS: failed to register key for device %s.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
- error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
+ ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
- if (error) {
+ if (ret) {
pr_err("pNFS: failed to reserve device %s.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
return 0;
+
+out_free_dev:
+ kfree(dev);
+ return ret;
}
static __be32
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 9421dae22737..668c7527b17e 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -427,7 +427,7 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid)
return -EINVAL;
}
- if (mnt_user_ns(path->mnt) != &init_user_ns) {
+ if (is_idmapped_mnt(path->mnt)) {
dprintk("exp_export: export of idmapped mounts not yet supported.\n");
return -EINVAL;
}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index be3c1aad50ea..2c1b027774d4 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -7,6 +7,7 @@
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/file.h>
+#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/list_lru.h>
#include <linux/fsnotify_backend.h>
@@ -44,12 +45,9 @@ struct nfsd_fcache_bucket {
static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
struct nfsd_fcache_disposal {
- struct list_head list;
struct work_struct work;
- struct net *net;
spinlock_t lock;
struct list_head freeme;
- struct rcu_head rcu;
};
static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
@@ -62,8 +60,6 @@ static long nfsd_file_lru_flags;
static struct fsnotify_group *nfsd_file_fsnotify_group;
static atomic_long_t nfsd_filecache_count;
static struct delayed_work nfsd_filecache_laundrette;
-static DEFINE_SPINLOCK(laundrette_lock);
-static LIST_HEAD(laundrettes);
static void nfsd_file_gc(void);
@@ -194,7 +190,6 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
__set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
}
nf->nf_mark = NULL;
- init_rwsem(&nf->nf_rwsem);
trace_nfsd_file_alloc(nf);
}
return nf;
@@ -242,6 +237,13 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
}
static void
+nfsd_file_flush(struct nfsd_file *nf)
+{
+ if (nf->nf_file && vfs_fsync(nf->nf_file, 1) != 0)
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
+}
+
+static void
nfsd_file_do_unhash(struct nfsd_file *nf)
{
lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
@@ -249,7 +251,7 @@ nfsd_file_do_unhash(struct nfsd_file *nf)
trace_nfsd_file_unhash(nf);
if (nfsd_file_check_write_error(nf))
- nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id));
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
--nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
hlist_del_rcu(&nf->nf_node);
atomic_long_dec(&nfsd_filecache_count);
@@ -300,19 +302,15 @@ nfsd_file_put_noref(struct nfsd_file *nf)
void
nfsd_file_put(struct nfsd_file *nf)
{
- bool is_hashed;
-
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) {
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
+ nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
- return;
+ } else {
+ nfsd_file_put_noref(nf);
+ if (nf->nf_file)
+ nfsd_file_schedule_laundrette();
}
-
- filemap_flush(nf->nf_file->f_mapping);
- is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
- nfsd_file_put_noref(nf);
- if (is_hashed)
- nfsd_file_schedule_laundrette();
if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
nfsd_file_gc();
}
@@ -333,6 +331,7 @@ nfsd_file_dispose_list(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del(&nf->nf_lru);
+ nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
}
}
@@ -346,6 +345,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del(&nf->nf_lru);
+ nfsd_file_flush(nf);
if (!refcount_dec_and_test(&nf->nf_ref))
continue;
if (nfsd_file_free(nf))
@@ -367,19 +367,13 @@ nfsd_file_list_remove_disposal(struct list_head *dst,
static void
nfsd_file_list_add_disposal(struct list_head *files, struct net *net)
{
- struct nfsd_fcache_disposal *l;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
- rcu_read_lock();
- list_for_each_entry_rcu(l, &laundrettes, list) {
- if (l->net == net) {
- spin_lock(&l->lock);
- list_splice_tail_init(files, &l->freeme);
- spin_unlock(&l->lock);
- queue_work(nfsd_filecache_wq, &l->work);
- break;
- }
- }
- rcu_read_unlock();
+ spin_lock(&l->lock);
+ list_splice_tail_init(files, &l->freeme);
+ spin_unlock(&l->lock);
+ queue_work(nfsd_filecache_wq, &l->work);
}
static void
@@ -602,6 +596,9 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *name, u32 cookie)
{
+ if (WARN_ON_ONCE(!inode))
+ return 0;
+
trace_nfsd_file_fsnotify_handle_event(inode, mask);
/* Should be no marks on non-regular files */
@@ -641,7 +638,7 @@ nfsd_file_cache_init(void)
if (!nfsd_filecache_wq)
goto out;
- nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
+ nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
if (!nfsd_file_hashtbl) {
pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
@@ -709,7 +706,7 @@ out_err:
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
@@ -752,7 +749,7 @@ nfsd_file_cache_purge(struct net *net)
}
static struct nfsd_fcache_disposal *
-nfsd_alloc_fcache_disposal(struct net *net)
+nfsd_alloc_fcache_disposal(void)
{
struct nfsd_fcache_disposal *l;
@@ -760,7 +757,6 @@ nfsd_alloc_fcache_disposal(struct net *net)
if (!l)
return NULL;
INIT_WORK(&l->work, nfsd_file_delayed_close);
- l->net = net;
spin_lock_init(&l->lock);
INIT_LIST_HEAD(&l->freeme);
return l;
@@ -769,61 +765,27 @@ nfsd_alloc_fcache_disposal(struct net *net)
static void
nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
{
- rcu_assign_pointer(l->net, NULL);
cancel_work_sync(&l->work);
nfsd_file_dispose_list(&l->freeme);
- kfree_rcu(l, rcu);
-}
-
-static void
-nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l)
-{
- spin_lock(&laundrette_lock);
- list_add_tail_rcu(&l->list, &laundrettes);
- spin_unlock(&laundrette_lock);
-}
-
-static void
-nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l)
-{
- spin_lock(&laundrette_lock);
- list_del_rcu(&l->list);
- spin_unlock(&laundrette_lock);
-}
-
-static int
-nfsd_alloc_fcache_disposal_net(struct net *net)
-{
- struct nfsd_fcache_disposal *l;
-
- l = nfsd_alloc_fcache_disposal(net);
- if (!l)
- return -ENOMEM;
- nfsd_add_fcache_disposal(l);
- return 0;
+ kfree(l);
}
static void
nfsd_free_fcache_disposal_net(struct net *net)
{
- struct nfsd_fcache_disposal *l;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
- rcu_read_lock();
- list_for_each_entry_rcu(l, &laundrettes, list) {
- if (l->net != net)
- continue;
- nfsd_del_fcache_disposal(l);
- rcu_read_unlock();
- nfsd_free_fcache_disposal(l);
- return;
- }
- rcu_read_unlock();
+ nfsd_free_fcache_disposal(l);
}
int
nfsd_file_cache_start_net(struct net *net)
{
- return nfsd_alloc_fcache_disposal_net(net);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nn->fcache_disposal = nfsd_alloc_fcache_disposal();
+ return nn->fcache_disposal ? 0 : -ENOMEM;
}
void
@@ -855,7 +817,7 @@ nfsd_file_cache_shutdown(void)
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 7872df5a0fe3..435ceab27897 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -46,7 +46,6 @@ struct nfsd_file {
refcount_t nf_ref;
unsigned char nf_may;
struct nfsd_file_mark *nf_mark;
- struct rw_semaphore nf_rwsem;
};
int nfsd_file_cache_init(void);
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index db7ef07ae50c..070f90ed09b6 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -61,7 +61,7 @@ nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
goto out_error;
fl->fh.size = fhp->fh_handle.fh_size;
- memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size);
+ memcpy(fl->fh.data, &fhp->fh_handle.fh_raw, fl->fh.size);
/* Give whole file layout segments */
seg->offset = 0;
@@ -117,7 +117,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
da->netaddr.addr_len =
snprintf(da->netaddr.addr, FF_ADDR_LEN + 1,
- "%s.%hhu.%hhu", addr, port >> 8, port & 0xff);
+ "%s.%d.%d", addr, port >> 8, port & 0xff);
da->tightly_coupled = false;
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 606fa155c28a..46a7f9b813e5 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -35,7 +35,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp,
/* must initialize before using! but maxsize doesn't matter */
fh_init(&fh,0);
fh.fh_handle.fh_size = f->size;
- memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
+ memcpy(&fh.fh_handle.fh_raw, f->data, f->size);
fh.fh_export = NULL;
access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ;
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 935c1028c217..1b1a962a1804 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -11,6 +11,7 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <linux/percpu_counter.h>
+#include <linux/siphash.h>
/* Hash tables for nfs4_clientid state */
#define CLIENT_HASH_BITS 4
@@ -108,9 +109,8 @@ struct nfsd_net {
bool nfsd_net_up;
bool lockd_up;
- /* Time of server startup */
- struct timespec64 nfssvc_boot;
- seqlock_t boot_lock;
+ seqlock_t writeverf_lock;
+ unsigned char writeverf[8];
/*
* Max number of connections this nfsd container will allow. Defaults
@@ -123,12 +123,13 @@ struct nfsd_net {
u32 clverifier_counter;
struct svc_serv *nfsd_serv;
-
- wait_queue_head_t ntf_wq;
- atomic_t ntf_refcnt;
-
- /* Allow umount to wait for nfsd state cleanup */
- struct completion nfsd_shutdown_complete;
+ /* When a listening socket is added to nfsd, keep_active is set
+ * and this justifies a reference on nfsd_serv. This stops
+ * nfsd_serv from being freed. When the number of threads is
+ * set, keep_active is cleared and the reference is dropped. So
+ * when the last thread exits, the service will be destroyed.
+ */
+ int keep_active;
/*
* clientid and stateid data for construction of net unique COPY
@@ -184,6 +185,10 @@ struct nfsd_net {
/* utsname taken from the process that starts the server */
char nfsd_name[UNX_MAXNODENAME+1];
+
+ struct nfsd_fcache_disposal *fcache_disposal;
+
+ siphash_key_t siphash_key;
};
/* Simple check to find out if a given net was properly initialized */
@@ -193,6 +198,6 @@ extern void nfsd_netns_free_versions(struct nfsd_net *nn);
extern unsigned int nfsd_net_id;
-void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn);
-void nfsd_reset_boot_verifier(struct nfsd_net *nn);
+void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn);
+void nfsd_reset_write_verifier(struct nfsd_net *nn);
#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 4b43929c1f25..b5760801d377 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -188,51 +188,51 @@ out:
* XDR decode functions
*/
-static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_getaclargs *argp = rqstp->rq_argp;
if (!svcxdr_decode_fhandle(xdr, &argp->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_setaclargs *argp = rqstp->rq_argp;
if (!svcxdr_decode_fhandle(xdr, &argp->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
- return 0;
+ return false;
if (argp->mask & ~NFS_ACL_MASK)
- return 0;
+ return false;
if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
&argp->acl_access : NULL))
- return 0;
+ return false;
if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
&argp->acl_default : NULL))
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_accessargs *args = rqstp->rq_argp;
if (!svcxdr_decode_fhandle(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->access) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -240,63 +240,63 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
*/
/* GETACL */
-static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
struct inode *inode;
int w;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
if (dentry == NULL || d_really_is_negative(dentry))
- return 1;
+ return true;
inode = d_inode(dentry);
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
- return 0;
+ return false;
rqstp->rq_res.page_len = w = nfsacl_size(
(resp->mask & NFS_ACL) ? resp->acl_access : NULL,
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
if (!*(rqstp->rq_next_page++))
- return 1;
+ return true;
w -= PAGE_SIZE;
}
if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
resp->mask & NFS_ACL, 0))
- return 0;
+ return false;
if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default,
resp->mask & NFS_DFACL, NFS_ACL_DEFAULT))
- return 0;
+ return false;
- return 1;
+ return true;
}
/* ACCESS */
-static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_accessres *resp = rqstp->rq_resp;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->access) < 0)
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
/*
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 5dfe7644a517..35b2ebda14da 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -127,38 +127,38 @@ out:
* XDR decode functions
*/
-static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_getaclargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->mask) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_setaclargs *argp = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
- return 0;
+ return false;
if (argp->mask & ~NFS_ACL_MASK)
- return 0;
+ return false;
if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
&argp->acl_access : NULL))
- return 0;
+ return false;
if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
&argp->acl_default : NULL))
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -166,9 +166,9 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
*/
/* GETACL */
-static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
struct kvec *head = rqstp->rq_res.head;
@@ -178,14 +178,14 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
int w;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
inode = d_inode(dentry);
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
- return 0;
+ return false;
base = (char *)xdr->p - (char *)head->iov_base;
@@ -194,7 +194,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
if (!*(rqstp->rq_next_page++))
- return 0;
+ return false;
w -= PAGE_SIZE;
}
@@ -207,20 +207,20 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
resp->mask & NFS_DFACL,
NFS_ACL_DEFAULT);
if (n <= 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* SETACL */
-static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfs3svc_encode_setaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
return svcxdr_encode_nfsstat3(xdr, resp->status) &&
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 17715a6c7a40..936eebd4c56d 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -150,13 +150,17 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
unsigned int len;
int v;
- argp->count = min_t(u32, argp->count, max_blocksize);
-
dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
SVCFH_fmt(&argp->fh),
(unsigned long) argp->count,
(unsigned long long) argp->offset);
+ argp->count = min_t(u32, argp->count, max_blocksize);
+ if (argp->offset > (u64)OFFSET_MAX)
+ argp->offset = (u64)OFFSET_MAX;
+ if (argp->offset + argp->count > (u64)OFFSET_MAX)
+ argp->count = (u64)OFFSET_MAX - argp->offset;
+
v = 0;
len = argp->count;
resp->pages = rqstp->rq_next_page;
@@ -199,19 +203,19 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
(unsigned long long) argp->offset,
argp->stable? " stable" : "");
+ resp->status = nfserr_fbig;
+ if (argp->offset > (u64)OFFSET_MAX ||
+ argp->offset + argp->len > (u64)OFFSET_MAX)
+ return rpc_success;
+
fh_copy(&resp->fh, &argp->fh);
resp->committed = argp->stable;
- nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
- &argp->first, cnt);
- if (!nvecs) {
- resp->status = nfserr_io;
- goto out;
- }
+ nvecs = svc_fill_write_vector(rqstp, &argp->payload);
+
resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
rqstp->rq_vec, nvecs, &cnt,
resp->committed, resp->verf);
resp->count = cnt;
-out:
return rpc_success;
}
@@ -439,22 +443,19 @@ nfsd3_proc_link(struct svc_rqst *rqstp)
static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
struct nfsd3_readdirres *resp,
- int count)
+ u32 count)
{
struct xdr_buf *buf = &resp->dirlist;
struct xdr_stream *xdr = &resp->xdr;
- count = min_t(u32, count, svc_max_payload(rqstp));
+ count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp));
memset(buf, 0, sizeof(*buf));
/* Reserve room for the NULL ptr & eof flag (-2 words) */
buf->buflen = count - XDR_UNIT * 2;
buf->pages = rqstp->rq_next_page;
- while (count > 0) {
- rqstp->rq_next_page++;
- count -= PAGE_SIZE;
- }
+ rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
/* This is xdr_init_encode(), but it assumes that
* the head kvec has already been consumed. */
@@ -463,7 +464,7 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
xdr->page_ptr = buf->pages;
xdr->iov = NULL;
xdr->p = page_address(*buf->pages);
- xdr->end = xdr->p + (PAGE_SIZE >> 2);
+ xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
xdr->rqst = NULL;
}
@@ -659,15 +660,9 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
argp->count,
(unsigned long long) argp->offset);
- if (argp->offset > NFS_OFFSET_MAX) {
- resp->status = nfserr_inval;
- goto out;
- }
-
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset,
argp->count, resp->verf);
-out:
return rpc_success;
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 0a5ebc52e6a9..0293b8d65f10 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -92,7 +92,7 @@ svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp)
return false;
fh_init(fhp, NFS3_FHSIZE);
fhp->fh_handle.fh_size = size;
- memcpy(&fhp->fh_handle.fh_base, p, size);
+ memcpy(&fhp->fh_handle.fh_raw, p, size);
return true;
}
@@ -131,7 +131,7 @@ svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp)
*p++ = cpu_to_be32(size);
if (size)
p[XDR_QUADLEN(size) - 1] = 0;
- memcpy(p, &fhp->fh_handle.fh_base, size);
+ memcpy(p, &fhp->fh_handle.fh_raw, size);
return true;
}
@@ -254,7 +254,7 @@ svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
if (xdr_stream_decode_u64(xdr, &newsize) < 0)
return false;
iap->ia_valid |= ATTR_SIZE;
- iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX);
+ iap->ia_size = newsize;
}
if (xdr_stream_decode_u32(xdr, &set_it) < 0)
return false;
@@ -487,88 +487,21 @@ neither:
return true;
}
-static bool fs_supports_change_attribute(struct super_block *sb)
-{
- return sb->s_flags & SB_I_VERSION || sb->s_export_op->fetch_iversion;
-}
-
-/*
- * Fill in the pre_op attr for the wcc data
- */
-void fill_pre_wcc(struct svc_fh *fhp)
-{
- struct inode *inode;
- struct kstat stat;
- bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
-
- if (fhp->fh_no_wcc || fhp->fh_pre_saved)
- return;
- inode = d_inode(fhp->fh_dentry);
- if (fs_supports_change_attribute(inode->i_sb) || !v4) {
- __be32 err = fh_getattr(fhp, &stat);
-
- if (err) {
- /* Grab the times from inode anyway */
- stat.mtime = inode->i_mtime;
- stat.ctime = inode->i_ctime;
- stat.size = inode->i_size;
- }
- fhp->fh_pre_mtime = stat.mtime;
- fhp->fh_pre_ctime = stat.ctime;
- fhp->fh_pre_size = stat.size;
- }
- if (v4)
- fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
-
- fhp->fh_pre_saved = true;
-}
-
-/*
- * Fill in the post_op attr for the wcc data
- */
-void fill_post_wcc(struct svc_fh *fhp)
-{
- bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
- struct inode *inode = d_inode(fhp->fh_dentry);
-
- if (fhp->fh_no_wcc)
- return;
-
- if (fhp->fh_post_saved)
- printk("nfsd: inode locked twice during operation.\n");
-
- fhp->fh_post_saved = true;
-
- if (fs_supports_change_attribute(inode->i_sb) || !v4) {
- __be32 err = fh_getattr(fhp, &fhp->fh_post_attr);
-
- if (err) {
- fhp->fh_post_saved = false;
- fhp->fh_post_attr.ctime = inode->i_ctime;
- }
- }
- if (v4)
- fhp->fh_post_change =
- nfsd4_change_attribute(&fhp->fh_post_attr, inode);
-}
-
/*
* XDR decode functions
*/
-int
-nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_fhandle *args = rqstp->rq_argp;
return svcxdr_decode_nfs_fh3(xdr, &args->fh);
}
-int
-nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_sattrargs *args = rqstp->rq_argp;
return svcxdr_decode_nfs_fh3(xdr, &args->fh) &&
@@ -576,96 +509,83 @@ nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
svcxdr_decode_sattrguard3(xdr, args);
}
-int
-nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_diropargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len);
}
-int
-nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_accessargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->access) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_readargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_writeargs *args = rqstp->rq_argp;
u32 max_blocksize = svc_max_payload(rqstp);
- struct kvec *head = rqstp->rq_arg.head;
- struct kvec *tail = rqstp->rq_arg.tail;
- size_t remaining;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->stable) < 0)
- return 0;
+ return false;
/* opaque data */
if (xdr_stream_decode_u32(xdr, &args->len) < 0)
- return 0;
+ return false;
/* request sanity */
if (args->count != args->len)
- return 0;
- remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
- remaining -= xdr_stream_pos(xdr);
- if (remaining < xdr_align_size(args->len))
- return 0;
+ return false;
if (args->count > max_blocksize) {
args->count = max_blocksize;
args->len = max_blocksize;
}
+ if (!xdr_stream_subsegment(xdr, &args->payload, args->count))
+ return false;
- args->first.iov_base = xdr->p;
- args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
-
- return 1;
+ return true;
}
-int
-nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_createargs *args = rqstp->rq_argp;
if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->createmode) < 0)
- return 0;
+ return false;
switch (args->createmode) {
case NFS3_CREATE_UNCHECKED:
case NFS3_CREATE_GUARDED:
@@ -673,18 +593,17 @@ nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
case NFS3_CREATE_EXCLUSIVE:
args->verf = xdr_inline_decode(xdr, NFS3_CREATEVERFSIZE);
if (!args->verf)
- return 0;
+ return false;
break;
default:
- return 0;
+ return false;
}
- return 1;
+ return true;
}
-int
-nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_createargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs3(xdr, &args->fh,
@@ -692,44 +611,42 @@ nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p)
svcxdr_decode_sattr3(rqstp, xdr, &args->attrs);
}
-int
-nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_symlinkargs *args = rqstp->rq_argp;
struct kvec *head = rqstp->rq_arg.head;
struct kvec *tail = rqstp->rq_arg.tail;
size_t remaining;
if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen))
- return 0;
+ return false;
if (!svcxdr_decode_sattr3(rqstp, xdr, &args->attrs))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
- return 0;
+ return false;
/* request sanity */
remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
remaining -= xdr_stream_pos(xdr);
if (remaining < xdr_align_size(args->tlen))
- return 0;
+ return false;
args->first.iov_base = xdr->p;
args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
- return 1;
+ return true;
}
-int
-nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_mknodargs *args = rqstp->rq_argp;
if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->ftype) < 0)
- return 0;
+ return false;
switch (args->ftype) {
case NF3CHR:
case NF3BLK:
@@ -743,16 +660,15 @@ nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p)
/* Valid XDR but illegal file types */
break;
default:
- return 0;
+ return false;
}
- return 1;
+ return true;
}
-int
-nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_renameargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs3(xdr, &args->ffh,
@@ -761,10 +677,9 @@ nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
&args->tname, &args->tlen);
}
-int
-nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_linkargs *args = rqstp->rq_argp;
return svcxdr_decode_nfs_fh3(xdr, &args->ffh) &&
@@ -772,62 +687,59 @@ nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
&args->tname, &args->tlen);
}
-int
-nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_readdirargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->cookie) < 0)
- return 0;
+ return false;
args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
if (!args->verf)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_readdirargs *args = rqstp->rq_argp;
u32 dircount;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->cookie) < 0)
- return 0;
+ return false;
args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
if (!args->verf)
- return 0;
+ return false;
/* dircount is ignored */
if (xdr_stream_decode_u32(xdr, &dircount) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_commitargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_commitargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -835,30 +747,28 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
*/
/* GETATTR */
-int
-nfs3svc_encode_getattrres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_getattrres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime);
if (!svcxdr_encode_fattr3(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
/* SETATTR, REMOVE, RMDIR */
-int
-nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_wccstat(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
return svcxdr_encode_nfsstat3(xdr, resp->status) &&
@@ -866,174 +776,168 @@ nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p)
}
/* LOOKUP */
-int nfs3svc_encode_lookupres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_lookupres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_diropres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_nfs_fh3(xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* ACCESS */
-int
-nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_accessres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->access) < 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* READLINK */
-int
-nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_readlinkres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->len) < 0)
- return 0;
+ return false;
xdr_write_pages(xdr, resp->pages, 0, resp->len);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* READ */
-int
-nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_readres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_bool(xdr, resp->eof) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
- return 0;
+ return false;
xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
resp->count);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* WRITE */
-int
-nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_writeres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_writeres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->committed) < 0)
- return 0;
+ return false;
if (!svcxdr_encode_writeverf3(xdr, resp->verf))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* CREATE, MKDIR, SYMLINK, MKNOD */
-int
-nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_createres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_diropres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_fh3(xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* RENAME */
-int
-nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_renameres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_renameres *resp = rqstp->rq_resp;
return svcxdr_encode_nfsstat3(xdr, resp->status) &&
@@ -1042,10 +946,9 @@ nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p)
}
/* LINK */
-int
-nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_linkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_linkres *resp = rqstp->rq_resp;
return svcxdr_encode_nfsstat3(xdr, resp->status) &&
@@ -1054,34 +957,33 @@ nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p)
}
/* READDIR */
-int
-nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
struct xdr_buf *dirlist = &resp->dirlist;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_cookieverf3(xdr, resp->verf))
- return 0;
+ return false;
xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
/* no more entries */
if (xdr_stream_encode_item_absent(xdr) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static __be32
@@ -1158,7 +1060,7 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name,
return false;
/* cookie */
resp->cookie_offset = dirlist->len;
- if (xdr_stream_encode_u64(xdr, NFS_OFFSET_MAX) < 0)
+ if (xdr_stream_encode_u64(xdr, OFFSET_MAX) < 0)
return false;
return true;
@@ -1308,27 +1210,26 @@ svcxdr_encode_fsstat3resok(struct xdr_stream *xdr,
}
/* FSSTAT */
-int
-nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_fsstatres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
if (!svcxdr_encode_fsstat3resok(xdr, resp))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static bool
@@ -1355,27 +1256,26 @@ svcxdr_encode_fsinfo3resok(struct xdr_stream *xdr,
}
/* FSINFO */
-int
-nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_fsinfores *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
if (!svcxdr_encode_fsinfo3resok(xdr, resp))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static bool
@@ -1398,51 +1298,49 @@ svcxdr_encode_pathconf3resok(struct xdr_stream *xdr,
}
/* PATHCONF */
-int
-nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_pathconfres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
if (!svcxdr_encode_pathconf3resok(xdr, resp))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* COMMIT */
-int
-nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_commitres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_commitres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_writeverf3(xdr, resp->verf))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/*
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 0f8b10f363e7..11f8715d92d6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -121,7 +121,7 @@ static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
BUG_ON(length > NFS4_FHSIZE);
p = xdr_reserve_space(xdr, 4 + length);
- xdr_encode_opaque(p, &fh->fh_base, length);
+ xdr_encode_opaque(p, &fh->fh_raw, length);
}
/*
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index a97873f2d22b..2c05692a9abf 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -145,8 +145,9 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
#ifdef CONFIG_NFSD_SCSILAYOUT
if (sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks &&
- sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops &&
- blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue))
+ sb->s_bdev &&
+ sb->s_bdev->bd_disk->fops->pr_ops &&
+ sb->s_bdev->bd_disk->fops->get_unique_id)
exp->ex_layout_types |= 1 << LAYOUT_SCSI;
#endif
}
@@ -421,7 +422,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
if (!new)
return nfserr_jukebox;
- memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+ memcpy(&new->lo_seg, seg, sizeof(new->lo_seg));
new->lo_state = ls;
spin_lock(&fp->fi_lock);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 486c5dba4b65..b207c76a873f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -519,7 +519,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fh_put(&cstate->current_fh);
cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
- memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
+ memcpy(&cstate->current_fh.fh_handle.fh_raw, putfh->pf_fhval,
putfh->pf_fhlen);
ret = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
@@ -598,7 +598,7 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data));
- nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id));
+ nfsd_copy_write_verifier(verf, net_generic(net, nfsd_net_id));
}
static __be32
@@ -782,12 +782,16 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
read->rd_nf = NULL;
- if (read->rd_offset >= OFFSET_MAX)
- return nfserr_inval;
trace_nfsd_read_start(rqstp, &cstate->current_fh,
read->rd_offset, read->rd_length);
+ read->rd_length = min_t(u32, read->rd_length, svc_max_payload(rqstp));
+ if (read->rd_offset > (u64)OFFSET_MAX)
+ read->rd_offset = (u64)OFFSET_MAX;
+ if (read->rd_offset + read->rd_length > (u64)OFFSET_MAX)
+ read->rd_length = (u64)OFFSET_MAX - read->rd_offset;
+
/*
* If we do a zero copy read, then a client will see read data
* that reflects the state of the file *after* performing the
@@ -1018,8 +1022,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
unsigned long cnt;
int nvecs;
- if (write->wr_offset >= OFFSET_MAX)
- return nfserr_inval;
+ if (write->wr_offset > (u64)OFFSET_MAX ||
+ write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX)
+ return nfserr_fbig;
cnt = write->wr_buflen;
trace_nfsd_write_start(rqstp, &cstate->current_fh,
@@ -1033,8 +1038,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
write->wr_how_written = write->wr_stable_how;
- nvecs = svc_fill_write_vector(rqstp, write->wr_payload.pages,
- write->wr_payload.head, write->wr_buflen);
+ nvecs = svc_fill_write_vector(rqstp, &write->wr_payload);
WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
@@ -1102,7 +1106,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- status = nfsd4_clone_file_range(src, clone->cl_src_pos,
+ status = nfsd4_clone_file_range(rqstp, src, clone->cl_src_pos,
dst, clone->cl_dst_pos, clone->cl_count,
EX_ISSYNC(cstate->current_fh.fh_export));
@@ -1178,7 +1182,7 @@ extern void nfs_sb_deactive(struct super_block *sb);
static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
struct nfsd4_ssc_umount_item **retwork, struct vfsmount **ss_mnt)
{
- struct nfsd4_ssc_umount_item *ni = 0;
+ struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd4_ssc_umount_item *work = NULL;
struct nfsd4_ssc_umount_item *tmp;
DEFINE_WAIT(wait);
@@ -1383,7 +1387,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
s_fh = &cstate->save_fh;
copy->c_fh.size = s_fh->fh_handle.fh_size;
- memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_base, copy->c_fh.size);
+ memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_raw, copy->c_fh.size);
copy->stateid.seqid = cpu_to_be32(s_stid->si_generation);
memcpy(copy->stateid.other, (void *)&s_stid->si_opaque,
sizeof(stateid_opaque_t));
@@ -1511,11 +1515,14 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
{
+ struct file *dst = copy->nf_dst->nf_file;
+ struct file *src = copy->nf_src->nf_file;
+ errseq_t since;
ssize_t bytes_copied = 0;
u64 bytes_total = copy->cp_count;
u64 src_pos = copy->cp_src_pos;
u64 dst_pos = copy->cp_dst_pos;
- __be32 status;
+ int status;
/* See RFC 7862 p.67: */
if (bytes_total == 0)
@@ -1523,9 +1530,8 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
do {
if (kthread_should_stop())
break;
- bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file,
- src_pos, copy->nf_dst->nf_file, dst_pos,
- bytes_total);
+ bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
+ bytes_total);
if (bytes_copied <= 0)
break;
bytes_total -= bytes_copied;
@@ -1535,11 +1541,11 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
} while (bytes_total > 0 && !copy->cp_synchronous);
/* for a non-zero asynchronous copy do a commit of data */
if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) {
- down_write(&copy->nf_dst->nf_rwsem);
- status = vfs_fsync_range(copy->nf_dst->nf_file,
- copy->cp_dst_pos,
+ since = READ_ONCE(dst->f_wb_err);
+ status = vfs_fsync_range(dst, copy->cp_dst_pos,
copy->cp_res.wr_bytes_written, 0);
- up_write(&copy->nf_dst->nf_rwsem);
+ if (!status)
+ status = filemap_check_wb_err(dst->f_mapping, since);
if (!status)
copy->committed = true;
}
@@ -2462,11 +2468,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
__be32 status;
resp->xdr = &rqstp->rq_res_stream;
+ resp->statusp = resp->xdr->p;
/* reserve space for: NFS status code */
xdr_reserve_space(resp->xdr, XDR_UNIT);
- resp->tagp = resp->xdr->p;
/* reserve space for: taglen, tag, and opcnt */
xdr_reserve_space(resp->xdr, XDR_UNIT * 2 + args->taglen);
resp->taglen = args->taglen;
@@ -2529,7 +2535,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
goto encode_op;
}
- fh_clear_wcc(current_fh);
+ fh_clear_pre_post_attrs(current_fh);
/* If op is non-idempotent */
if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) {
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 6fedc49726bf..c634483d85d2 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -2156,6 +2156,7 @@ static struct notifier_block nfsd4_cld_block = {
int
register_cld_notifier(void)
{
+ WARN_ON(!nfsd_net_id);
return rpc_pipefs_notifier_register(&nfsd4_cld_block);
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3f4027a5de88..234e852fcdfa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -246,6 +246,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
list_for_each_entry(cur, &lo->lo_blocked, nbl_list) {
if (fh_match(fh, &cur->nbl_fh)) {
list_del_init(&cur->nbl_list);
+ WARN_ON(list_empty(&cur->nbl_lru));
list_del_init(&cur->nbl_lru);
found = cur;
break;
@@ -271,6 +272,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
INIT_LIST_HEAD(&nbl->nbl_lru);
fh_copy_shallow(&nbl->nbl_fh, fh);
locks_init_lock(&nbl->nbl_lock);
+ kref_init(&nbl->nbl_kref);
nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client,
&nfsd4_cb_notify_lock_ops,
NFSPROC4_CLNT_CB_NOTIFY_LOCK);
@@ -280,11 +282,20 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
}
static void
+free_nbl(struct kref *kref)
+{
+ struct nfsd4_blocked_lock *nbl;
+
+ nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref);
+ kfree(nbl);
+}
+
+static void
free_blocked_lock(struct nfsd4_blocked_lock *nbl)
{
locks_delete_block(&nbl->nbl_lock);
locks_release_private(&nbl->nbl_lock);
- kfree(nbl);
+ kref_put(&nbl->nbl_kref, free_nbl);
}
static void
@@ -302,6 +313,7 @@ remove_blocked_locks(struct nfs4_lockowner *lo)
struct nfsd4_blocked_lock,
nbl_list);
list_del_init(&nbl->nbl_list);
+ WARN_ON(list_empty(&nbl->nbl_lru));
list_move(&nbl->nbl_lru, &reaplist);
}
spin_unlock(&nn->blocked_locks_lock);
@@ -360,11 +372,13 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
* st_{access,deny}_bmap field of the stateid, in order to track not
* only what share bits are currently in force, but also what
* combinations of share bits previous opens have used. This allows us
- * to enforce the recommendation of rfc 3530 14.2.19 that the server
- * return an error if the client attempt to downgrade to a combination
- * of share bits not explicable by closing some of its previous opens.
+ * to enforce the recommendation in
+ * https://datatracker.ietf.org/doc/html/rfc7530#section-16.19.4 that
+ * the server return an error if the client attempt to downgrade to a
+ * combination of share bits not explicable by closing some of its
+ * previous opens.
*
- * XXX: This enforcement is actually incomplete, since we don't keep
+ * This enforcement is arguably incomplete, since we don't keep
* track of access/deny bit combinations; so, e.g., we allow:
*
* OPEN allow read, deny write
@@ -372,6 +386,10 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
* DOWNGRADE allow read, deny none
*
* which we should reject.
+ *
+ * But you could also argue that our current code is already overkill,
+ * since it only exists to return NFS4ERR_INVAL on incorrect client
+ * behavior.
*/
static unsigned int
bmap_to_share_mode(unsigned long bmap)
@@ -1010,7 +1028,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
}
spin_unlock(&blocked_delegations_lock);
}
- hash = jhash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_raw, fh->fh_size, 0);
if (test_bit(hash&255, bd->set[0]) &&
test_bit((hash>>8)&255, bd->set[0]) &&
test_bit((hash>>16)&255, bd->set[0]))
@@ -1029,7 +1047,7 @@ static void block_delegations(struct knfsd_fh *fh)
u32 hash;
struct bloom_pair *bd = &blocked_delegations;
- hash = jhash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_raw, fh->fh_size, 0);
spin_lock(&blocked_delegations_lock);
__set_bit(hash&255, bd->set[bd->new]);
@@ -1207,6 +1225,11 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
return 0;
}
+static bool delegation_hashed(struct nfs4_delegation *dp)
+{
+ return !(list_empty(&dp->dl_perfile));
+}
+
static bool
unhash_delegation_locked(struct nfs4_delegation *dp)
{
@@ -1214,7 +1237,7 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
lockdep_assert_held(&state_lock);
- if (list_empty(&dp->dl_perfile))
+ if (!delegation_hashed(dp))
return false;
dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
@@ -4107,8 +4130,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
status = nfserr_clid_inuse;
if (client_has_state(old)
&& !same_creds(&unconf->cl_cred,
- &old->cl_cred))
+ &old->cl_cred)) {
+ old = NULL;
goto out;
+ }
status = mark_client_expired_locked(old);
if (status) {
old = NULL;
@@ -4598,7 +4623,7 @@ static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
* queued for a lease break. Don't queue it again.
*/
spin_lock(&state_lock);
- if (dp->dl_time == 0) {
+ if (delegation_hashed(dp) && dp->dl_time == 0) {
dp->dl_time = ktime_get_boottime_seconds();
list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
}
@@ -4686,6 +4711,14 @@ nfsd_break_deleg_cb(struct file_lock *fl)
return ret;
}
+/**
+ * nfsd_breaker_owns_lease - Check if lease conflict was resolved
+ * @fl: Lock state to check
+ *
+ * Return values:
+ * %true: Lease conflict was resolved
+ * %false: Lease conflict was not resolved.
+ */
static bool nfsd_breaker_owns_lease(struct file_lock *fl)
{
struct nfs4_delegation *dl = fl->fl_owner;
@@ -4693,11 +4726,11 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
struct nfs4_client *clp;
if (!i_am_nfsd())
- return NULL;
+ return false;
rqst = kthread_data(current);
/* Note rq_prog == NFS_ACL_PROGRAM is also possible: */
if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4)
- return NULL;
+ return false;
clp = *(rqst->rq_lease_breaker);
return dl->dl_stid.sc_client == clp;
}
@@ -5541,7 +5574,7 @@ static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn)
static void nfsd4_ssc_expire_umount(struct nfsd_net *nn)
{
bool do_wakeup = false;
- struct nfsd4_ssc_umount_item *ni = 0;
+ struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd4_ssc_umount_item *tmp;
spin_lock(&nn->nfsd_ssc_lock);
@@ -6035,7 +6068,11 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
*nfp = NULL;
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
- status = check_special_stateids(net, fhp, stateid, flags);
+ if (cstid)
+ status = nfserr_bad_stateid;
+ else
+ status = check_special_stateids(net, fhp, stateid,
+ flags);
goto done;
}
@@ -6497,7 +6534,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
}
static fl_owner_t
-nfsd4_fl_get_owner(fl_owner_t owner)
+nfsd4_lm_get_owner(fl_owner_t owner)
{
struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
@@ -6506,7 +6543,7 @@ nfsd4_fl_get_owner(fl_owner_t owner)
}
static void
-nfsd4_fl_put_owner(fl_owner_t owner)
+nfsd4_lm_put_owner(fl_owner_t owner)
{
struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
@@ -6541,8 +6578,8 @@ nfsd4_lm_notify(struct file_lock *fl)
static const struct lock_manager_operations nfsd_posix_mng_ops = {
.lm_notify = nfsd4_lm_notify,
- .lm_get_owner = nfsd4_fl_get_owner,
- .lm_put_owner = nfsd4_fl_put_owner,
+ .lm_get_owner = nfsd4_lm_get_owner,
+ .lm_put_owner = nfsd4_lm_put_owner,
};
static inline void
@@ -6831,7 +6868,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_blocked_lock *nbl = NULL;
struct file_lock *file_lock = NULL;
struct file_lock *conflock = NULL;
- struct super_block *sb;
__be32 status = 0;
int lkflg;
int err;
@@ -6853,7 +6889,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfsd4_lock: permission denied!\n");
return status;
}
- sb = cstate->current_fh.fh_dentry->d_sb;
if (lock->lk_is_new) {
if (nfsd4_has_session(cstate))
@@ -6905,8 +6940,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
- if (nfsd4_has_session(cstate) &&
- !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS))
+ if (nfsd4_has_session(cstate))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
@@ -6918,8 +6952,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fl_type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
- if (nfsd4_has_session(cstate) &&
- !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS))
+ if (nfsd4_has_session(cstate))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
@@ -6940,6 +6973,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
+ /*
+ * Most filesystems with their own ->lock operations will block
+ * the nfsd thread waiting to acquire the lock. That leads to
+ * deadlocks (we don't want every nfsd thread tied up waiting
+ * for file locks), so don't attempt blocking lock notifications
+ * on those filesystems:
+ */
+ if (nf->nf_file->f_op->lock)
+ fl_flags &= ~FL_SLEEP;
+
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
if (!nbl) {
dprintk("NFSD: %s: unable to allocate block!\n", __func__);
@@ -6970,6 +7013,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
spin_lock(&nn->blocked_locks_lock);
list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
+ kref_get(&nbl->nbl_kref);
spin_unlock(&nn->blocked_locks_lock);
}
@@ -6982,6 +7026,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nn->somebody_reclaimed = true;
break;
case FILE_LOCK_DEFERRED:
+ kref_put(&nbl->nbl_kref, free_nbl);
nbl = NULL;
fallthrough;
case -EAGAIN: /* conflock holds conflicting lock */
@@ -7002,8 +7047,13 @@ out:
/* dequeue it if we queued it before */
if (fl_flags & FL_SLEEP) {
spin_lock(&nn->blocked_locks_lock);
- list_del_init(&nbl->nbl_list);
- list_del_init(&nbl->nbl_lru);
+ if (!list_empty(&nbl->nbl_list) &&
+ !list_empty(&nbl->nbl_lru)) {
+ list_del_init(&nbl->nbl_list);
+ list_del_init(&nbl->nbl_lru);
+ kref_put(&nbl->nbl_kref, free_nbl);
+ }
+ /* nbl can use one of lists to be linked to reaplist */
spin_unlock(&nn->blocked_locks_lock);
}
free_blocked_lock(nbl);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index cf030ebe2827..da92e7d2ab6a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -277,24 +277,10 @@ nfsd4_decode_verifier4(struct nfsd4_compoundargs *argp, nfs4_verifier *verf)
static __be32
nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen)
{
- u32 i, count;
- __be32 *p;
-
- if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
- return nfserr_bad_xdr;
- /* request sanity */
- if (count > 1000)
- return nfserr_bad_xdr;
- p = xdr_inline_decode(argp->xdr, count << 2);
- if (!p)
- return nfserr_bad_xdr;
- i = 0;
- while (i < count)
- bmval[i++] = be32_to_cpup(p++);
- while (i < bmlen)
- bmval[i++] = 0;
+ ssize_t status;
- return nfs_ok;
+ status = xdr_stream_decode_uint32_array(argp->xdr, bmval, bmlen);
+ return status == -EBADMSG ? nfserr_bad_xdr : nfs_ok;
}
static __be32
@@ -2322,7 +2308,7 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
return true;
}
-static int
+static bool
nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
{
struct nfsd4_op *op;
@@ -2335,25 +2321,25 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
int i;
if (xdr_stream_decode_u32(argp->xdr, &argp->taglen) < 0)
- return 0;
+ return false;
max_reply += XDR_UNIT;
argp->tag = NULL;
if (unlikely(argp->taglen)) {
if (argp->taglen > NFSD4_MAX_TAGLEN)
- return 0;
+ return false;
p = xdr_inline_decode(argp->xdr, argp->taglen);
if (!p)
- return 0;
+ return false;
argp->tag = svcxdr_savemem(argp, p, argp->taglen);
if (!argp->tag)
- return 0;
+ return false;
max_reply += xdr_align_size(argp->taglen);
}
if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
- return 0;
+ return false;
/*
* NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
@@ -2361,14 +2347,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
* nfsd4_proc can handle this is an NFS-level error.
*/
if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
- return 1;
+ return true;
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
if (!argp->ops) {
argp->ops = argp->iops;
dprintk("nfsd: couldn't allocate room for COMPOUND\n");
- return 0;
+ return false;
}
}
@@ -2380,7 +2366,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
op->replay = NULL;
if (xdr_stream_decode_u32(argp->xdr, &op->opnum) < 0)
- return 0;
+ return false;
if (nfsd4_opnum_in_range(argp, op)) {
op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
if (op->status != nfs_ok)
@@ -2427,7 +2413,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
- return 1;
+ return true;
}
static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
@@ -2868,6 +2854,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
if (err)
goto out_nfserr;
+ if (!(stat.result_mask & STATX_BTIME))
+ /* underlying FS does not offer btime so we can't share it */
+ bmval1 &= ~FATTR4_WORD1_TIME_CREATE;
if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
(bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
@@ -3110,7 +3099,7 @@ out_acl:
p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
if (!p)
goto out_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
fhp->fh_handle.fh_size);
}
if (bmval0 & FATTR4_WORD0_FILEID) {
@@ -3268,6 +3257,13 @@ out_acl:
p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
*p++ = cpu_to_be32(stat.mtime.tv_nsec);
}
+ if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec);
+ *p++ = cpu_to_be32(stat.btime.tv_nsec);
+ }
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
struct kstat parent_stat;
u64 ino = stat.ino;
@@ -3509,7 +3505,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
p = xdr_reserve_space(xdr, 3*4 + namlen);
if (!p)
goto fail;
- p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
+ p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */
p = xdr_encode_array(p, name, namlen); /* name length & name */
nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
@@ -3670,7 +3666,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
p = xdr_reserve_space(xdr, len + 4);
if (!p)
return nfserr_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len);
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
return 0;
}
@@ -4000,10 +3996,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
}
xdr_commit_encode(xdr);
- maxcount = svc_max_payload(resp->rqstp);
- maxcount = min_t(unsigned long, maxcount,
+ maxcount = min_t(unsigned long, read->rd_length,
(xdr->buf->buflen - xdr->buf->len));
- maxcount = min_t(unsigned long, maxcount, read->rd_length);
if (file->f_op->splice_read &&
test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
@@ -4807,8 +4801,8 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
return nfserr_resource;
*p++ = htonl(NFS4_CONTENT_HOLE);
- p = xdr_encode_hyper(p, read->rd_offset);
- p = xdr_encode_hyper(p, count);
+ p = xdr_encode_hyper(p, read->rd_offset);
+ p = xdr_encode_hyper(p, count);
*eof = (read->rd_offset + count) >= f_size;
*maxcount = min_t(unsigned long, count, *maxcount);
@@ -4840,10 +4834,8 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfserr_resource;
xdr_commit_encode(xdr);
- maxcount = svc_max_payload(resp->rqstp);
- maxcount = min_t(unsigned long, maxcount,
+ maxcount = min_t(unsigned long, read->rd_length,
(xdr->buf->buflen - xdr->buf->len));
- maxcount = min_t(unsigned long, maxcount, read->rd_length);
count = maxcount;
eof = read->rd_offset >= i_size_read(file_inode(file));
@@ -5414,40 +5406,46 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
}
}
-int
-nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd4_compoundargs *args = rqstp->rq_argp;
/* svcxdr_tmp_alloc */
args->to_free = NULL;
- args->xdr = &rqstp->rq_arg_stream;
+ args->xdr = xdr;
args->ops = args->iops;
args->rqstp = rqstp;
return nfsd4_decode_compound(args);
}
-int
-nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
- struct xdr_buf *buf = resp->xdr->buf;
+ struct xdr_buf *buf = xdr->buf;
+ __be32 *p;
WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
buf->tail[0].iov_len);
- *p = resp->cstate.status;
+ /*
+ * Send buffer space for the following items is reserved
+ * at the top of nfsd4_proc_compound().
+ */
+ p = resp->statusp;
+
+ *p++ = resp->cstate.status;
- rqstp->rq_next_page = resp->xdr->page_ptr + 1;
+ rqstp->rq_next_page = xdr->page_ptr + 1;
- p = resp->tagp;
*p++ = htonl(resp->taglen);
memcpy(p, resp->tag, resp->taglen);
p += XDR_QUADLEN(resp->taglen);
*p++ = htonl(resp->opcnt);
nfsd4_sequence_done(resp);
- return 1;
+ return true;
}
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 96cdf77925f3..0b3f12aa37ff 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -84,12 +84,6 @@ nfsd_hashsize(unsigned int limit)
return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}
-static u32
-nfsd_cache_hash(__be32 xid, struct nfsd_net *nn)
-{
- return hash_32(be32_to_cpu(xid), nn->maskbits);
-}
-
static struct svc_cacherep *
nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
struct nfsd_net *nn)
@@ -241,8 +235,16 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
list_move_tail(&rp->c_lru, &b->lru_head);
}
-static long
-prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
+static noinline struct nfsd_drc_bucket *
+nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn)
+{
+ unsigned int hash = hash_32((__force u32)xid, nn->maskbits);
+
+ return &nn->drc_hashtbl[hash];
+}
+
+static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
+ unsigned int max)
{
struct svc_cacherep *rp, *tmp;
long freed = 0;
@@ -258,11 +260,17 @@ prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
nfsd_reply_cache_free_locked(b, rp, nn);
- freed++;
+ if (max && freed++ > max)
+ break;
}
return freed;
}
+static long nfsd_prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
+{
+ return prune_bucket(b, nn, 3);
+}
+
/*
* Walk the LRU list and prune off entries that are older than RC_EXPIRE.
* Also prune the oldest ones when the total exceeds the max number of entries.
@@ -279,7 +287,7 @@ prune_cache_entries(struct nfsd_net *nn)
if (list_empty(&b->lru_head))
continue;
spin_lock(&b->cache_lock);
- freed += prune_bucket(b, nn);
+ freed += prune_bucket(b, nn, 0);
spin_unlock(&b->cache_lock);
}
return freed;
@@ -413,12 +421,10 @@ out:
*/
int nfsd_cache_lookup(struct svc_rqst *rqstp)
{
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfsd_net *nn;
struct svc_cacherep *rp, *found;
- __be32 xid = rqstp->rq_xid;
__wsum csum;
- u32 hash = nfsd_cache_hash(xid, nn);
- struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash];
+ struct nfsd_drc_bucket *b;
int type = rqstp->rq_cachetype;
int rtn = RC_DOIT;
@@ -434,17 +440,16 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
* Since the common case is a cache miss followed by an insert,
* preallocate an entry.
*/
+ nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
if (!rp)
goto out;
+ b = nfsd_cache_bucket_find(rqstp->rq_xid, nn);
spin_lock(&b->cache_lock);
found = nfsd_cache_insert(b, rp, nn);
- if (found != rp) {
- nfsd_reply_cache_free_locked(NULL, rp, nn);
- rp = found;
+ if (found != rp)
goto found_entry;
- }
nfsd_stats_rc_misses_inc();
rqstp->rq_cacherep = rp;
@@ -453,8 +458,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
atomic_inc(&nn->num_drc_entries);
nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp));
- /* go ahead and prune the cache */
- prune_bucket(b, nn);
+ nfsd_prune_bucket(b, nn);
out_unlock:
spin_unlock(&b->cache_lock);
@@ -463,8 +467,10 @@ out:
found_entry:
/* We found a matching entry which is either in progress or done. */
+ nfsd_reply_cache_free_locked(NULL, rp, nn);
nfsd_stats_rc_hits_inc();
rtn = RC_DROPIT;
+ rp = found;
/* Request being processed */
if (rp->c_state == RC_INPROG)
@@ -523,7 +529,6 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
- u32 hash;
struct nfsd_drc_bucket *b;
int len;
size_t bufsize = 0;
@@ -531,8 +536,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
if (!rp)
return;
- hash = nfsd_cache_hash(rp->c_key.k_xid, nn);
- b = &nn->drc_hashtbl[hash];
+ b = nfsd_cache_bucket_find(rp->c_key.k_xid, nn);
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
len >>= 2;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 070e5dd03e26..16920e4512bd 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -395,12 +395,12 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
auth_domain_put(dom);
if (len)
return len;
-
+
mesg = buf;
len = SIMPLE_TRANSACTION_LIMIT;
- qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
+ qword_addhex(&mesg, &len, fh.fh_raw, fh.fh_size);
mesg[-1] = '\n';
- return mesg - buf;
+ return mesg - buf;
}
/*
@@ -742,13 +742,12 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
return err;
err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
- if (err < 0) {
- nfsd_destroy(net);
- return err;
- }
- /* Decrease the count, but don't shut down the service */
- nn->nfsd_serv->sv_nrthreads--;
+ if (err >= 0 &&
+ !nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
+ svc_get(nn->nfsd_serv);
+
+ nfsd_put(net);
return err;
}
@@ -773,30 +772,29 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
if (err != 0)
return err;
- err = svc_create_xprt(nn->nfsd_serv, transport, net,
- PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
+ err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0)
goto out_err;
- err = svc_create_xprt(nn->nfsd_serv, transport, net,
- PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
+ err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
- /* Decrease the count, but don't shut down the service */
- nn->nfsd_serv->sv_nrthreads--;
+ if (!nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
+ svc_get(nn->nfsd_serv);
+
+ nfsd_put(net);
return 0;
out_close:
xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
if (xprt != NULL) {
- svc_close_xprt(xprt);
+ svc_xprt_close(xprt);
svc_xprt_put(xprt);
}
out_err:
- if (!list_empty(&nn->nfsd_serv->sv_permsocks))
- nn->nfsd_serv->sv_nrthreads--;
- else
- nfsd_destroy(net);
+ nfsd_put(net);
return err;
}
@@ -1249,7 +1247,8 @@ static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry)
clear_ncl(d_inode(dentry));
dget(dentry);
ret = simple_unlink(dir, dentry);
- d_delete(dentry);
+ d_drop(dentry);
+ fsnotify_unlink(dir, dentry);
dput(dentry);
WARN_ON_ONCE(ret);
}
@@ -1340,8 +1339,8 @@ void nfsd_client_rmdir(struct dentry *dentry)
dget(dentry);
ret = simple_rmdir(dir, dentry);
WARN_ON_ONCE(ret);
+ d_drop(dentry);
fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
dput(dentry);
inode_unlock(dir);
}
@@ -1485,9 +1484,8 @@ static __net_init int nfsd_init_net(struct net *net)
nn->clientid_counter = nn->clientid_base + 1;
nn->s2s_cp_cl_id = nn->clientid_counter++;
- atomic_set(&nn->ntf_refcnt, 0);
- init_waitqueue_head(&nn->ntf_wq);
- seqlock_init(&nn->boot_lock);
+ get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
+ seqlock_init(&nn->writeverf_lock);
return 0;
@@ -1521,12 +1519,9 @@ static int __init init_nfsd(void)
int retval;
printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
- retval = register_cld_notifier();
- if (retval)
- return retval;
retval = nfsd4_init_slabs();
if (retval)
- goto out_unregister_notifier;
+ return retval;
retval = nfsd4_init_pnfs();
if (retval)
goto out_free_slabs;
@@ -1545,9 +1540,14 @@ static int __init init_nfsd(void)
goto out_free_exports;
retval = register_pernet_subsys(&nfsd_net_ops);
if (retval < 0)
+ goto out_free_filesystem;
+ retval = register_cld_notifier();
+ if (retval)
goto out_free_all;
return 0;
out_free_all:
+ unregister_pernet_subsys(&nfsd_net_ops);
+out_free_filesystem:
unregister_filesystem(&nfsd_fs_type);
out_free_exports:
remove_proc_entry("fs/nfs/exports", NULL);
@@ -1561,13 +1561,12 @@ out_free_pnfs:
nfsd4_exit_pnfs();
out_free_slabs:
nfsd4_free_slabs();
-out_unregister_notifier:
- unregister_cld_notifier();
return retval;
}
static void __exit exit_nfsd(void)
{
+ unregister_cld_notifier();
unregister_pernet_subsys(&nfsd_net_ops);
nfsd_drc_slab_free();
remove_proc_entry("fs/nfs/exports", NULL);
@@ -1577,7 +1576,6 @@ static void __exit exit_nfsd(void)
nfsd4_free_slabs();
nfsd4_exit_pnfs();
unregister_filesystem(&nfsd_fs_type);
- unregister_cld_notifier();
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 9664303afdaf..4fc1fd639527 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -78,8 +78,10 @@ extern const struct seq_operations nfs_exports_op;
*/
struct nfsd_voidargs { };
struct nfsd_voidres { };
-int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p);
-int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p);
+bool nfssvc_decode_voidarg(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr);
+bool nfssvc_encode_voidres(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr);
/*
* Function prototypes.
@@ -95,7 +97,7 @@ int nfsd_pool_stats_open(struct inode *, struct file *);
int nfsd_pool_stats_release(struct inode *, struct file *);
void nfsd_shutdown_threads(struct net *net);
-void nfsd_destroy(struct net *net);
+void nfsd_put(struct net *net);
bool i_am_nfsd(void);
@@ -362,7 +364,7 @@ void nfsd_lockd_shutdown(void);
| FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
| FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
| FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
- | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
+ | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_CREATE \
| FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
#define NFSD4_SUPPORTED_ATTRS_WORD2 0
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c475d2271f9c..c29baa03dfaf 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -154,11 +154,12 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
{
struct knfsd_fh *fh = &fhp->fh_handle;
- struct fid *fid = NULL, sfid;
+ struct fid *fid = NULL;
struct svc_export *exp;
struct dentry *dentry;
int fileid_type;
int data_left = fh->fh_size/4;
+ int len;
__be32 error;
error = nfserr_stale;
@@ -167,48 +168,35 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (rqstp->rq_vers == 4 && fh->fh_size == 0)
return nfserr_nofilehandle;
- if (fh->fh_version == 1) {
- int len;
-
- if (--data_left < 0)
- return error;
- if (fh->fh_auth_type != 0)
- return error;
- len = key_len(fh->fh_fsid_type) / 4;
- if (len == 0)
- return error;
- if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
- /* deprecated, convert to type 3 */
- len = key_len(FSID_ENCODE_DEV)/4;
- fh->fh_fsid_type = FSID_ENCODE_DEV;
- /*
- * struct knfsd_fh uses host-endian fields, which are
- * sometimes used to hold net-endian values. This
- * confuses sparse, so we must use __force here to
- * keep it from complaining.
- */
- fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
- ntohl((__force __be32)fh->fh_fsid[1])));
- fh->fh_fsid[1] = fh->fh_fsid[2];
- }
- data_left -= len;
- if (data_left < 0)
- return error;
- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
- fid = (struct fid *)(fh->fh_fsid + len);
- } else {
- __u32 tfh[2];
- dev_t xdev;
- ino_t xino;
-
- if (fh->fh_size != NFS_FHSIZE)
- return error;
- /* assume old filehandle format */
- xdev = old_decode_dev(fh->ofh_xdev);
- xino = u32_to_ino_t(fh->ofh_xino);
- mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
- exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
+ if (fh->fh_version != 1)
+ return error;
+
+ if (--data_left < 0)
+ return error;
+ if (fh->fh_auth_type != 0)
+ return error;
+ len = key_len(fh->fh_fsid_type) / 4;
+ if (len == 0)
+ return error;
+ if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+ /* deprecated, convert to type 3 */
+ len = key_len(FSID_ENCODE_DEV)/4;
+ fh->fh_fsid_type = FSID_ENCODE_DEV;
+ /*
+ * struct knfsd_fh uses host-endian fields, which are
+ * sometimes used to hold net-endian values. This
+ * confuses sparse, so we must use __force here to
+ * keep it from complaining.
+ */
+ fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
+ ntohl((__force __be32)fh->fh_fsid[1])));
+ fh->fh_fsid[1] = fh->fh_fsid[2];
}
+ data_left -= len;
+ if (data_left < 0)
+ return error;
+ exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+ fid = (struct fid *)(fh->fh_fsid + len);
error = nfserr_stale;
if (IS_ERR(exp)) {
@@ -253,18 +241,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (rqstp->rq_vers > 2)
error = nfserr_badhandle;
- if (fh->fh_version != 1) {
- sfid.i32.ino = fh->ofh_ino;
- sfid.i32.gen = fh->ofh_generation;
- sfid.i32.parent_ino = fh->ofh_dirino;
- fid = &sfid;
- data_left = 3;
- if (fh->ofh_dirino == 0)
- fileid_type = FILEID_INO32_GEN;
- else
- fileid_type = FILEID_INO32_GEN_PARENT;
- } else
- fileid_type = fh->fh_fileid_type;
+ fileid_type = fh->fh_fileid_type;
if (fileid_type == FILEID_ROOT)
dentry = dget(exp->ex_path.dentry);
@@ -452,20 +429,6 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
}
}
-/*
- * for composing old style file handles
- */
-static inline void _fh_update_old(struct dentry *dentry,
- struct svc_export *exp,
- struct knfsd_fh *fh)
-{
- fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino);
- fh->ofh_generation = d_inode(dentry)->i_generation;
- if (d_is_dir(dentry) ||
- (exp->ex_flags & NFSEXP_NOSUBTREECHECK))
- fh->ofh_dirino = 0;
-}
-
static bool is_root_export(struct svc_export *exp)
{
return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
@@ -562,9 +525,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
/* ref_fh is a reference file handle.
* if it is non-null and for the same filesystem, then we should compose
* a filehandle which is of the same version, where possible.
- * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
- * Then create a 32byte filehandle using nfs_fhbase_old
- *
*/
struct inode * inode = d_inode(dentry);
@@ -600,35 +560,21 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
fhp->fh_dentry = dget(dentry); /* our internal copy */
fhp->fh_export = exp_get(exp);
- if (fhp->fh_handle.fh_version == 0xca) {
- /* old style filehandle please */
- memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
- fhp->fh_handle.fh_size = NFS_FHSIZE;
- fhp->fh_handle.ofh_dcookie = 0xfeebbaca;
- fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev);
- fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev;
- fhp->fh_handle.ofh_xino =
- ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino);
- fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry));
- if (inode)
- _fh_update_old(dentry, exp, &fhp->fh_handle);
- } else {
- fhp->fh_handle.fh_size =
- key_len(fhp->fh_handle.fh_fsid_type) + 4;
- fhp->fh_handle.fh_auth_type = 0;
-
- mk_fsid(fhp->fh_handle.fh_fsid_type,
- fhp->fh_handle.fh_fsid,
- ex_dev,
- d_inode(exp->ex_path.dentry)->i_ino,
- exp->ex_fsid, exp->ex_uuid);
-
- if (inode)
- _fh_update(fhp, exp, dentry);
- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
- fh_put(fhp);
- return nfserr_opnotsupp;
- }
+ fhp->fh_handle.fh_size =
+ key_len(fhp->fh_handle.fh_fsid_type) + 4;
+ fhp->fh_handle.fh_auth_type = 0;
+
+ mk_fsid(fhp->fh_handle.fh_fsid_type,
+ fhp->fh_handle.fh_fsid,
+ ex_dev,
+ d_inode(exp->ex_path.dentry)->i_ino,
+ exp->ex_fsid, exp->ex_uuid);
+
+ if (inode)
+ _fh_update(fhp, exp, dentry);
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
+ fh_put(fhp);
+ return nfserr_opnotsupp;
}
return 0;
@@ -649,16 +595,12 @@ fh_update(struct svc_fh *fhp)
dentry = fhp->fh_dentry;
if (d_really_is_negative(dentry))
goto out_negative;
- if (fhp->fh_handle.fh_version != 1) {
- _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
- } else {
- if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
- return 0;
+ if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
+ return 0;
- _fh_update(fhp, fhp->fh_export, dentry);
- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
- return nfserr_opnotsupp;
- }
+ _fh_update(fhp, fhp->fh_export, dentry);
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
+ return nfserr_opnotsupp;
return 0;
out_bad:
printk(KERN_ERR "fh_update: fh not verified!\n");
@@ -669,6 +611,66 @@ out_negative:
return nfserr_serverfault;
}
+/**
+ * fh_fill_pre_attrs - Fill in pre-op attributes
+ * @fhp: file handle to be updated
+ *
+ * Saves size, mtime and ctime of the inode behind @fhp into the
+ * fh_pre_* fields and marks them valid via fh_pre_saved. Nothing is
+ * done when fh_no_wcc is set or the pre-op attributes were already
+ * saved.
+ */
+void fh_fill_pre_attrs(struct svc_fh *fhp)
+{
+ /* NOTE(review): an NFS4_FHSIZE-sized handle is taken to mean NFSv4 */
+ bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
+ struct inode *inode;
+ struct kstat stat;
+ __be32 err;
+
+ if (fhp->fh_no_wcc || fhp->fh_pre_saved)
+ return;
+
+ inode = d_inode(fhp->fh_dentry);
+ err = fh_getattr(fhp, &stat);
+ if (err) {
+ /* Grab the times from inode anyway */
+ stat.mtime = inode->i_mtime;
+ stat.ctime = inode->i_ctime;
+ stat.size = inode->i_size;
+ }
+ /* The change attribute is only recorded for v4 handles */
+ if (v4)
+ fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
+
+ fhp->fh_pre_mtime = stat.mtime;
+ fhp->fh_pre_ctime = stat.ctime;
+ fhp->fh_pre_size = stat.size;
+ fhp->fh_pre_saved = true;
+}
+
+/**
+ * fh_fill_post_attrs - Fill in post-op attributes
+ * @fhp: file handle to be updated
+ *
+ * Stores the result of fh_getattr() in fh_post_attr and sets
+ * fh_post_saved accordingly. If the getattr fails, fh_post_saved is
+ * cleared and only ctime is taken from the inode. Nothing is done
+ * when fh_no_wcc is set.
+ */
+void fh_fill_post_attrs(struct svc_fh *fhp)
+{
+ /* NOTE(review): an NFS4_FHSIZE-sized handle is taken to mean NFSv4 */
+ bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
+ struct inode *inode = d_inode(fhp->fh_dentry);
+ __be32 err;
+
+ if (fhp->fh_no_wcc)
+ return;
+
+ /* Post-op attributes already present: report it, then overwrite */
+ if (fhp->fh_post_saved)
+ printk("nfsd: inode locked twice during operation.\n");
+
+ err = fh_getattr(fhp, &fhp->fh_post_attr);
+ if (err) {
+ fhp->fh_post_saved = false;
+ fhp->fh_post_attr.ctime = inode->i_ctime;
+ } else
+ fhp->fh_post_saved = true;
+ /* The change attribute is computed even when the getattr failed */
+ if (v4)
+ fhp->fh_post_change =
+ nfsd4_change_attribute(&fhp->fh_post_attr, inode);
+}
+
/*
* Release a file handle.
*/
@@ -681,7 +683,7 @@ fh_put(struct svc_fh *fhp)
fh_unlock(fhp);
fhp->fh_dentry = NULL;
dput(dentry);
- fh_clear_wcc(fhp);
+ fh_clear_pre_post_attrs(fhp);
}
fh_drop_write(fhp);
if (exp) {
@@ -698,16 +700,11 @@ fh_put(struct svc_fh *fhp)
+/*
+ * Format a file handle as "<size>: <hex bytes>" for debug output.
+ *
+ * The string lives in a single static buffer, so each call overwrites
+ * the result of the previous one. Handles larger than 64 bytes are
+ * reported as "bad-fh"; the buffer is sized for at most 64 hex-dumped
+ * bytes.
+ */
char * SVCFH_fmt(struct svc_fh *fhp)
{
struct knfsd_fh *fh = &fhp->fh_handle;
+ static char buf[2+1+1+64*3+1];
- static char buf[80];
- sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x",
- fh->fh_size,
- fh->fh_base.fh_pad[0],
- fh->fh_base.fh_pad[1],
- fh->fh_base.fh_pad[2],
- fh->fh_base.fh_pad[3],
- fh->fh_base.fh_pad[4],
- fh->fh_base.fh_pad[5]);
+ /* fh_size is unsigned, so only the upper bound needs checking */
+ if (fh->fh_size > 64)
+ return "bad-fh";
+ /* the '*' field width of %*ph must be passed as an int */
+ snprintf(buf, sizeof(buf), "%u: %*ph", fh->fh_size, (int)fh->fh_size, fh->fh_raw);
return buf;
}
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 6106697adc04..fb9d358a267e 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -10,9 +10,56 @@
#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
-#include <uapi/linux/nfsd/nfsfh.h>
#include <linux/iversion.h>
#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+
+/*
+ * The file handle starts with a sequence of four-byte words.
+ * The first word contains a version number (1) and three descriptor bytes
+ * that tell how the remaining 3 variable length fields should be handled.
+ * These three bytes are auth_type, fsid_type and fileid_type.
+ *
+ * All four-byte values are in host-byte-order.
+ *
+ * The auth_type field is deprecated and must be set to 0.
+ *
+ * The fsid_type identifies how the filesystem (or export point) is
+ * encoded.
+ * Current values:
+ * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
+ * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
+ * says we mustn't. We must break it up and reassemble.
+ * 1 - 4 byte user specified identifier
+ * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
+ * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
+ * 4 - 4 byte inode number and 4 byte uuid
+ * 5 - 8 byte uuid
+ * 6 - 16 byte uuid
+ * 7 - 8 byte inode number and 16 byte uuid
+ *
+ * The fileid_type identifies how the file within the filesystem is encoded.
+ * The values for this field are filesystem specific, except that
+ * filesystems must not use the values '0' or '0xff'. See 'enum fid_type'
+ * in include/linux/exportfs.h for currently registered values.
+ */
+
+/*
+ * In-kernel file handle: a byte count plus the handle itself, which can
+ * be viewed either as raw bytes (fh_raw) or through the structured
+ * header that overlays the start of the same storage.
+ */
+struct knfsd_fh {
+ unsigned int fh_size; /*
+ * Size of the handle in bytes; while a
+ * new file handle is being built this
+ * holds the size reached so far.
+ */
+ union {
+ /* whole handle as opaque bytes */
+ char fh_raw[NFS4_FHSIZE];
+ /* first word: version + three descriptor bytes, then fsid words */
+ struct {
+ u8 fh_version; /* == 1 */
+ u8 fh_auth_type; /* deprecated */
+ u8 fh_fsid_type;
+ u8 fh_fileid_type;
+ u32 fh_fsid[]; /* flexible-array member */
+ };
+ };
+};
static inline __u32 ino_t_to_u32(ino_t ino)
{
@@ -43,7 +90,6 @@ typedef struct svc_fh {
* operation
*/
int fh_flags; /* FH flags */
-#ifdef CONFIG_NFSD_V3
bool fh_post_saved; /* post-op attrs saved */
bool fh_pre_saved; /* pre-op attrs saved */
@@ -60,7 +106,6 @@ typedef struct svc_fh {
/* Post-op attributes saved in fh_unlock */
struct kstat fh_post_attr; /* full attrs after operation */
u64 fh_post_change; /* nfsv4 change; see above */
-#endif /* CONFIG_NFSD_V3 */
} svc_fh;
#define NFSD4_FH_FOREIGN (1<<0)
#define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f))
@@ -188,7 +233,7 @@ static inline void
fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
{
dst->fh_size = src->fh_size;
- memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
+ memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size);
}
static __inline__ struct svc_fh *
@@ -203,7 +248,7 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
{
if (fh1->fh_size != fh2->fh_size)
return false;
- if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
+ if (memcmp(fh1->fh_raw, fh2->fh_raw, fh1->fh_size) != 0)
return false;
return true;
}
@@ -227,7 +272,7 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
*/
static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
{
- return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+ return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size);
}
#else
static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
@@ -236,13 +281,12 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
}
#endif
-#ifdef CONFIG_NFSD_V3
-/*
- * The wcc data stored in current_fh should be cleared
- * between compound ops.
+/**
+ * fh_clear_pre_post_attrs - Reset pre/post attributes
+ * @fhp: file handle to be updated
+ *
*/
-static inline void
-fh_clear_wcc(struct svc_fh *fhp)
+static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
{
fhp->fh_post_saved = false;
fhp->fh_pre_saved = false;
@@ -276,13 +320,8 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat,
return time_to_chattr(&stat->ctime);
}
-extern void fill_pre_wcc(struct svc_fh *fhp);
-extern void fill_post_wcc(struct svc_fh *fhp);
-#else
-#define fh_clear_wcc(ignored)
-#define fill_pre_wcc(ignored)
-#define fill_post_wcc(notused)
-#endif /* CONFIG_NFSD_V3 */
+extern void fh_fill_pre_attrs(struct svc_fh *fhp);
+extern void fh_fill_post_attrs(struct svc_fh *fhp);
/*
@@ -308,7 +347,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
inode = d_inode(dentry);
inode_lock_nested(inode, subclass);
- fill_pre_wcc(fhp);
+ fh_fill_pre_attrs(fhp);
fhp->fh_locked = true;
}
@@ -325,7 +364,7 @@ static inline void
fh_unlock(struct svc_fh *fhp)
{
if (fhp->fh_locked) {
- fill_post_wcc(fhp);
+ fh_fill_post_attrs(fhp);
inode_unlock(d_inode(fhp->fh_dentry));
fhp->fh_locked = false;
}
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 90fcd6178823..fcdab8a8a41f 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -230,16 +230,11 @@ nfsd_proc_write(struct svc_rqst *rqstp)
unsigned long cnt = argp->len;
unsigned int nvecs;
- dprintk("nfsd: WRITE %s %d bytes at %d\n",
+ dprintk("nfsd: WRITE %s %u bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
- nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
- &argp->first, cnt);
- if (!nvecs) {
- resp->status = nfserr_io;
- goto out;
- }
+ nvecs = svc_fill_write_vector(rqstp, &argp->payload);
resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh),
argp->offset, rqstp->rq_vec, nvecs,
@@ -248,7 +243,6 @@ nfsd_proc_write(struct svc_rqst *rqstp)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
return rpc_drop_reply;
-out:
return rpc_success;
}
@@ -557,17 +551,17 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp)
static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
struct nfsd_readdirres *resp,
- int count)
+ u32 count)
{
struct xdr_buf *buf = &resp->dirlist;
struct xdr_stream *xdr = &resp->xdr;
- count = min_t(u32, count, PAGE_SIZE);
+ count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp));
memset(buf, 0, sizeof(*buf));
/* Reserve room for the NULL ptr & eof flag (-2 words) */
- buf->buflen = count - sizeof(__be32) * 2;
+ buf->buflen = count - XDR_UNIT * 2;
buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page++;
@@ -578,7 +572,7 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
xdr->page_ptr = buf->pages;
xdr->iov = NULL;
xdr->p = page_address(*buf->pages);
- xdr->end = xdr->p + (PAGE_SIZE >> 2);
+ xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
xdr->rqst = NULL;
}
@@ -851,6 +845,7 @@ nfserrno (int errno)
{ nfserr_io, -EIO },
{ nfserr_nxio, -ENXIO },
{ nfserr_fbig, -E2BIG },
+ { nfserr_stale, -EBADF },
{ nfserr_acces, -EACCES },
{ nfserr_exist, -EEXIST },
{ nfserr_xdev, -EXDEV },
@@ -879,6 +874,8 @@ nfserrno (int errno)
{ nfserr_toosmall, -ETOOSMALL },
{ nfserr_serverfault, -ESERVERFAULT },
{ nfserr_serverfault, -ENFILE },
+ { nfserr_io, -EREMOTEIO },
+ { nfserr_stale, -EOPENSTALE },
{ nfserr_io, -EUCLEAN },
{ nfserr_perm, -ENOKEY },
{ nfserr_no_grace, -ENOGRACE},
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ccb59e91011b..4bb5baa17040 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/fs_struct.h>
#include <linux/swap.h>
+#include <linux/siphash.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
@@ -55,18 +56,17 @@ static __be32 nfsd_init_request(struct svc_rqst *,
struct svc_process_info *);
/*
- * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
- * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
- * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
+ * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members
+ * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks.
*
* If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
- * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
- * of nfsd threads must exist and each must listed in ->sp_all_threads in each
- * entry of ->sv_pools[].
+ * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless
+ * nn->keep_active is set). That number of nfsd threads must
+ * exist and each must be listed in ->sp_all_threads in some entry of
+ * ->sv_pools[].
*
- * Transitions of the thread count between zero and non-zero are of particular
- * interest since the svc_serv needs to be created and initialized at that
- * point, or freed.
+ * Each active thread holds a counted reference on nn->nfsd_serv, as does
+ * the nn->keep_active flag and various transient calls to svc_get().
*
* Finally, the nfsd_mutex also protects some of the global variables that are
* accessed when nfsd starts and that are settable via the write_* routines in
@@ -117,9 +117,7 @@ static struct svc_stat nfsd_acl_svcstats = {
static const struct svc_version *nfsd_version[] = {
[2] = &nfsd_version2,
-#if defined(CONFIG_NFSD_V3)
[3] = &nfsd_version3,
-#endif
#if defined(CONFIG_NFSD_V4)
[4] = &nfsd_version4,
#endif
@@ -293,13 +291,13 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred)
if (!list_empty(&nn->nfsd_serv->sv_permsocks))
return 0;
- error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
- SVC_SOCK_DEFAULTS, cred);
+ error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
+ SVC_SOCK_DEFAULTS, cred);
if (error < 0)
return error;
- error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
- SVC_SOCK_DEFAULTS, cred);
+ error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
+ SVC_SOCK_DEFAULTS, cred);
if (error < 0)
return error;
@@ -345,33 +343,57 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn)
return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST);
}
-void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn)
+/**
+ * nfsd_copy_write_verifier - Atomically copy a write verifier
+ * @verf: buffer in which to receive the verifier cookie
+ * @nn: NFS net namespace
+ *
+ * This function provides a wait-free mechanism for copying the
+ * namespace's write verifier without tearing it. Both words of
+ * @verf are filled in.
+ */
+void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn)
{
int seq = 0;
do {
- read_seqbegin_or_lock(&nn->boot_lock, &seq);
- /*
- * This is opaque to client, so no need to byte-swap. Use
- * __force to keep sparse happy. y2038 time_t overflow is
- * irrelevant in this usage
- */
- verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
- verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
- } while (need_seqretry(&nn->boot_lock, seq));
- done_seqretry(&nn->boot_lock, seq);
+ read_seqbegin_or_lock(&nn->writeverf_lock, &seq);
+ /*
+ * @verf is an array parameter and therefore a plain pointer
+ * here: sizeof(*verf) is a single __be32. Size the copy by
+ * the source array so the whole verifier is transferred.
+ */
+ memcpy(verf, nn->writeverf, sizeof(nn->writeverf));
+ } while (need_seqretry(&nn->writeverf_lock, seq));
+ done_seqretry(&nn->writeverf_lock, seq);
}
-static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn)
+/*
+ * Replace nn->writeverf with a siphash of the current raw clock
+ * reading, keyed with nn->siphash_key. No locking is done here;
+ * nfsd_reset_write_verifier() calls this with nn->writeverf_lock
+ * write-held.
+ */
+static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn)
{
- ktime_get_real_ts64(&nn->nfssvc_boot);
+ struct timespec64 now;
+ u64 verf;
+
+ /*
+ * Because the time value is hashed, y2038 time_t overflow
+ * is irrelevant in this usage.
+ */
+ ktime_get_raw_ts64(&now);
+ verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key);
+ memcpy(nn->writeverf, &verf, sizeof(nn->writeverf));
}
-void nfsd_reset_boot_verifier(struct nfsd_net *nn)
+/**
+ * nfsd_reset_write_verifier - Generate a new write verifier
+ * @nn: NFS net namespace
+ *
+ * This function updates the ->writeverf field of @nn. This field
+ * contains an opaque cookie that, according to Section 18.32.3 of
+ * RFC 8881, "the client can use to determine whether a server has
+ * changed instance state (e.g., server restart) between a call to
+ * WRITE and a subsequent call to either WRITE or COMMIT. This
+ * cookie MUST be unchanged during a single instance of the NFSv4.1
+ * server and MUST be unique between instances of the NFSv4.1
+ * server."
+ *
+ * The update is performed with @nn->writeverf_lock held for writing.
+ */
+void nfsd_reset_write_verifier(struct nfsd_net *nn)
{
- write_seqlock(&nn->boot_lock);
- nfsd_reset_boot_verifier_locked(nn);
- write_sequnlock(&nn->boot_lock);
+ write_seqlock(&nn->writeverf_lock);
+ nfsd_reset_write_verifier_locked(nn);
+ write_sequnlock(&nn->writeverf_lock);
}
static int nfsd_startup_net(struct net *net, const struct cred *cred)
@@ -435,6 +457,7 @@ static void nfsd_shutdown_net(struct net *net)
nfsd_shutdown_generic();
}
+static DEFINE_SPINLOCK(nfsd_notifier_lock);
static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
@@ -444,18 +467,17 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct sockaddr_in sin;
- if ((event != NETDEV_DOWN) ||
- !atomic_inc_not_zero(&nn->ntf_refcnt))
+ if (event != NETDEV_DOWN || !nn->nfsd_serv)
goto out;
+ spin_lock(&nfsd_notifier_lock);
if (nn->nfsd_serv) {
dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = ifa->ifa_local;
svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
}
- atomic_dec(&nn->ntf_refcnt);
- wake_up(&nn->ntf_wq);
+ spin_unlock(&nfsd_notifier_lock);
out:
return NOTIFY_DONE;
@@ -475,10 +497,10 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct sockaddr_in6 sin6;
- if ((event != NETDEV_DOWN) ||
- !atomic_inc_not_zero(&nn->ntf_refcnt))
+ if (event != NETDEV_DOWN || !nn->nfsd_serv)
goto out;
+ spin_lock(&nfsd_notifier_lock);
if (nn->nfsd_serv) {
dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
sin6.sin6_family = AF_INET6;
@@ -487,8 +509,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
sin6.sin6_scope_id = ifa->idev->dev->ifindex;
svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
}
- atomic_dec(&nn->ntf_refcnt);
- wake_up(&nn->ntf_wq);
+ spin_unlock(&nfsd_notifier_lock);
+
out:
return NOTIFY_DONE;
}
@@ -505,7 +527,6 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- atomic_dec(&nn->ntf_refcnt);
/* check if the notifier still has clients */
if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
@@ -513,7 +534,6 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
#endif
}
- wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0);
/*
* write_ports can create the server without actually starting
@@ -590,24 +610,6 @@ static int nfsd_get_default_max_blksize(void)
return ret;
}
-static const struct svc_serv_ops nfsd_thread_sv_ops = {
- .svo_shutdown = nfsd_last_thread,
- .svo_function = nfsd,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_setup = svc_set_num_threads,
- .svo_module = THIS_MODULE,
-};
-
-static void nfsd_complete_shutdown(struct net *net)
-{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
- WARN_ON(!mutex_is_locked(&nfsd_mutex));
-
- nn->nfsd_serv = NULL;
- complete(&nn->nfsd_shutdown_complete);
-}
-
void nfsd_shutdown_threads(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -622,11 +624,9 @@ void nfsd_shutdown_threads(struct net *net)
svc_get(serv);
/* Kill outstanding nfsd threads */
- serv->sv_ops->svo_setup(serv, NULL, 0);
- nfsd_destroy(net);
+ svc_set_num_threads(serv, NULL, 0);
+ nfsd_put(net);
mutex_unlock(&nfsd_mutex);
- /* Wait for shutdown of nfsd_serv to complete */
- wait_for_completion(&nn->nfsd_shutdown_complete);
}
bool i_am_nfsd(void)
@@ -638,6 +638,7 @@ int nfsd_create_serv(struct net *net)
{
int error;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv;
WARN_ON(!mutex_is_locked(&nfsd_mutex));
if (nn->nfsd_serv) {
@@ -647,19 +648,22 @@ int nfsd_create_serv(struct net *net)
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions(nn);
- nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
- &nfsd_thread_sv_ops);
- if (nn->nfsd_serv == NULL)
+ serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd);
+ if (serv == NULL)
return -ENOMEM;
- init_completion(&nn->nfsd_shutdown_complete);
- nn->nfsd_serv->sv_maxconn = nn->max_connections;
- error = svc_bind(nn->nfsd_serv, net);
+ serv->sv_maxconn = nn->max_connections;
+ error = svc_bind(serv, net);
if (error < 0) {
- svc_destroy(nn->nfsd_serv);
- nfsd_complete_shutdown(net);
+ /* NOT nfsd_put() as notifiers (see below) haven't
+ * been set up yet.
+ */
+ svc_put(serv);
return error;
}
+ spin_lock(&nfsd_notifier_lock);
+ nn->nfsd_serv = serv;
+ spin_unlock(&nfsd_notifier_lock);
set_max_drc();
/* check if the notifier is already set */
@@ -669,8 +673,7 @@ int nfsd_create_serv(struct net *net)
register_inet6addr_notifier(&nfsd_inet6addr_notifier);
#endif
}
- atomic_inc(&nn->ntf_refcnt);
- nfsd_reset_boot_verifier(nn);
+ nfsd_reset_write_verifier(nn);
return 0;
}
@@ -697,16 +700,27 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
return 0;
}
-void nfsd_destroy(struct net *net)
+/* This is the callback for kref_put() below.
+ * There is no code here as the first thing to be done is
+ * call svc_shutdown_net(), but we cannot get the 'net' from
+ * the kref. So do all the work when kref_put returns true.
+ */
+static void nfsd_noop(struct kref *ref)
+{
+}
+
+void nfsd_put(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- int destroy = (nn->nfsd_serv->sv_nrthreads == 1);
- if (destroy)
- svc_shutdown_net(nn->nfsd_serv, net);
- svc_destroy(nn->nfsd_serv);
- if (destroy)
- nfsd_complete_shutdown(net);
+ if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) {
+ svc_xprt_destroy_all(nn->nfsd_serv, net);
+ nfsd_last_thread(nn->nfsd_serv, net);
+ svc_destroy(&nn->nfsd_serv->sv_refcnt);
+ spin_lock(&nfsd_notifier_lock);
+ nn->nfsd_serv = NULL;
+ spin_unlock(&nfsd_notifier_lock);
+ }
}
int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
@@ -733,7 +747,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
if (tot > NFSD_MAXSERVS) {
/* total too large: scale down requested numbers */
for (i = 0; i < n && tot > 0; i++) {
- int new = nthreads[i] * NFSD_MAXSERVS / tot;
+ int new = nthreads[i] * NFSD_MAXSERVS / tot;
tot -= (nthreads[i] - new);
nthreads[i] = new;
}
@@ -753,12 +767,13 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
/* apply the new numbers */
svc_get(nn->nfsd_serv);
for (i = 0; i < n; i++) {
- err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
- &nn->nfsd_serv->sv_pools[i], nthreads[i]);
+ err = svc_set_num_threads(nn->nfsd_serv,
+ &nn->nfsd_serv->sv_pools[i],
+ nthreads[i]);
if (err)
break;
}
- nfsd_destroy(net);
+ nfsd_put(net);
return err;
}
@@ -795,21 +810,19 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
error = nfsd_startup_net(net, cred);
if (error)
- goto out_destroy;
- error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
- NULL, nrservs);
+ goto out_put;
+ error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
if (error)
goto out_shutdown;
- /* We are holding a reference to nn->nfsd_serv which
- * we don't want to count in the return value,
- * so subtract 1
- */
- error = nn->nfsd_serv->sv_nrthreads - 1;
+ error = nn->nfsd_serv->sv_nrthreads;
out_shutdown:
if (error < 0 && !nfsd_up_before)
nfsd_shutdown_net(net);
-out_destroy:
- nfsd_destroy(net); /* Release server */
+out_put:
+ /* Threads now hold service active */
+ if (xchg(&nn->keep_active, 0))
+ nfsd_put(net);
+ nfsd_put(net);
out:
mutex_unlock(&nfsd_mutex);
return error;
@@ -923,9 +936,6 @@ nfsd(void *vrqstp)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int err;
- /* Lock module and set up kernel thread */
- mutex_lock(&nfsd_mutex);
-
/* At this point, the thread shares current->fs
* with the init process. We need to create files with the
* umask as defined by the client instead of init's umask. */
@@ -945,8 +955,7 @@ nfsd(void *vrqstp)
allow_signal(SIGINT);
allow_signal(SIGQUIT);
- nfsdstats.th_cnt++;
- mutex_unlock(&nfsd_mutex);
+ atomic_inc(&nfsdstats.th_cnt);
set_freezable();
@@ -973,20 +982,34 @@ nfsd(void *vrqstp)
/* Clear signals before calling svc_exit_thread() */
flush_signals(current);
- mutex_lock(&nfsd_mutex);
- nfsdstats.th_cnt --;
+ atomic_dec(&nfsdstats.th_cnt);
out:
- rqstp->rq_server = NULL;
+ /* Take an extra ref so that the svc_put in svc_exit_thread()
+ * doesn't call svc_destroy()
+ */
+ svc_get(nn->nfsd_serv);
/* Release the thread */
svc_exit_thread(rqstp);
- nfsd_destroy(net);
+ /* We need to drop a ref, but may not drop the last reference
+ * without holding nfsd_mutex, and we cannot wait for nfsd_mutex as that
+ * could deadlock with nfsd_shutdown_threads() waiting for us.
+ * So three options are:
+ * - drop a non-final reference,
+ * - get the mutex without waiting
+ * - sleep briefly and try the above again
+ */
+ while (!svc_put_not_last(nn->nfsd_serv)) {
+ if (mutex_trylock(&nfsd_mutex)) {
+ nfsd_put(net);
+ mutex_unlock(&nfsd_mutex);
+ break;
+ }
+ msleep(20);
+ }
- /* Release module */
- mutex_unlock(&nfsd_mutex);
- module_put_and_exit(0);
return 0;
}
@@ -1004,9 +1027,6 @@ out:
int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
const struct svc_procedure *proc = rqstp->rq_procinfo;
- struct kvec *argv = &rqstp->rq_arg.head[0];
- struct kvec *resv = &rqstp->rq_res.head[0];
- __be32 *p;
/*
* Give the xdr decoder a chance to change this if it wants
@@ -1015,7 +1035,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
rqstp->rq_cachetype = proc->pc_cachetype;
svcxdr_init_decode(rqstp);
- if (!proc->pc_decode(rqstp, argv->iov_base))
+ if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
switch (nfsd_cache_lookup(rqstp)) {
@@ -1031,14 +1051,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
* Need to grab the location to store the status, as
* NFSv4 does some encoding while processing
*/
- p = resv->iov_base + resv->iov_len;
svcxdr_init_encode(rqstp);
*statp = proc->pc_func(rqstp);
if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags))
goto out_update_drop;
- if (!proc->pc_encode(rqstp, p))
+ if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;
nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
@@ -1065,29 +1084,29 @@ out_encode_err:
/**
* nfssvc_decode_voidarg - Decode void arguments
* @rqstp: Server RPC transaction context
- * @p: buffer containing arguments to decode
+ * @xdr: XDR stream positioned at arguments to decode
*
* Return values:
- * %0: Arguments were not valid
- * %1: Decoding was successful
+ * %false: Arguments were not valid
+ * %true: Decoding was successful
*/
-int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+bool nfssvc_decode_voidarg(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
/**
* nfssvc_encode_voidres - Encode void results
* @rqstp: Server RPC transaction context
- * @p: buffer in which to encode results
+ * @xdr: XDR stream into which to encode results
*
* Return values:
- * %0: Local error while encoding
- * %1: Encoding was successful
+ * %false: Local error while encoding
+ * %true: Encoding was successful
*/
-int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
+bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
@@ -1100,7 +1119,6 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
mutex_unlock(&nfsd_mutex);
return -ENODEV;
}
- /* bump up the psudo refcount while traversing */
svc_get(nn->nfsd_serv);
ret = svc_pool_stats_open(nn->nfsd_serv, file);
mutex_unlock(&nfsd_mutex);
@@ -1113,8 +1131,7 @@ int nfsd_pool_stats_release(struct inode *inode, struct file *file)
struct net *net = inode->i_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
- /* this function really, really should have been called svc_put() */
- nfsd_destroy(net);
+ nfsd_put(net);
mutex_unlock(&nfsd_mutex);
return ret;
}
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index a06c05fe3b42..aba8520b4b8b 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -64,7 +64,7 @@ svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp)
if (!p)
return false;
fh_init(fhp, NFS_FHSIZE);
- memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
+ memcpy(&fhp->fh_handle.fh_raw, p, NFS_FHSIZE);
fhp->fh_handle.fh_size = NFS_FHSIZE;
return true;
@@ -78,7 +78,7 @@ svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp)
p = xdr_reserve_space(xdr, NFS_FHSIZE);
if (!p)
return false;
- memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
+ memcpy(p, &fhp->fh_handle.fh_raw, NFS_FHSIZE);
return true;
}
@@ -272,94 +272,81 @@ svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
* XDR decode functions
*/
-int
-nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_fhandle *args = rqstp->rq_argp;
return svcxdr_decode_fhandle(xdr, &args->fh);
}
-int
-nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_sattrargs *args = rqstp->rq_argp;
return svcxdr_decode_fhandle(xdr, &args->fh) &&
svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
}
-int
-nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_diropargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs(xdr, &args->fh, &args->name, &args->len);
}
-int
-nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_readargs *args = rqstp->rq_argp;
u32 totalcount;
if (!svcxdr_decode_fhandle(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->offset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
/* totalcount is ignored */
if (xdr_stream_decode_u32(xdr, &totalcount) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_writeargs *args = rqstp->rq_argp;
- struct kvec *head = rqstp->rq_arg.head;
- struct kvec *tail = rqstp->rq_arg.tail;
u32 beginoffset, totalcount;
- size_t remaining;
if (!svcxdr_decode_fhandle(xdr, &args->fh))
- return 0;
+ return false;
/* beginoffset is ignored */
if (xdr_stream_decode_u32(xdr, &beginoffset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->offset) < 0)
- return 0;
+ return false;
/* totalcount is ignored */
if (xdr_stream_decode_u32(xdr, &totalcount) < 0)
- return 0;
+ return false;
/* opaque data */
if (xdr_stream_decode_u32(xdr, &args->len) < 0)
- return 0;
+ return false;
if (args->len > NFSSVC_MAXBLKSIZE_V2)
- return 0;
- remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
- remaining -= xdr_stream_pos(xdr);
- if (remaining < xdr_align_size(args->len))
- return 0;
- args->first.iov_base = xdr->p;
- args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
+ return false;
+ if (!xdr_stream_subsegment(xdr, &args->payload, args->len))
+ return false;
- return 1;
+ return true;
}
-int
-nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_createargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs(xdr, &args->fh,
@@ -367,10 +354,9 @@ nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
}
-int
-nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_renameargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs(xdr, &args->ffh,
@@ -379,10 +365,9 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
&args->tname, &args->tlen);
}
-int
-nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_linkargs *args = rqstp->rq_argp;
return svcxdr_decode_fhandle(xdr, &args->ffh) &&
@@ -390,178 +375,170 @@ nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
&args->tname, &args->tlen);
}
-int
-nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_symlinkargs *args = rqstp->rq_argp;
struct kvec *head = rqstp->rq_arg.head;
if (!svcxdr_decode_diropargs(xdr, &args->ffh, &args->fname, &args->flen))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
- return 0;
+ return false;
if (args->tlen == 0)
- return 0;
+ return false;
args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
args->first.iov_base = xdr_inline_decode(xdr, args->tlen);
if (!args->first.iov_base)
- return 0;
+ return false;
return svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
}
-int
-nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_readdirargs *args = rqstp->rq_argp;
if (!svcxdr_decode_fhandle(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->cookie) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
* XDR encode functions
*/
-int
-nfssvc_encode_statres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_statres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_stat *resp = rqstp->rq_resp;
return svcxdr_encode_stat(xdr, resp->status);
}
-int
-nfssvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_attrstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_attrstat *resp = rqstp->rq_resp;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_diropres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_diropres *resp = rqstp->rq_resp;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_fhandle(xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_readlinkres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (xdr_stream_encode_u32(xdr, resp->len) < 0)
- return 0;
+ return false;
xdr_write_pages(xdr, &resp->page, 0, resp->len);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_readres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
- return 0;
+ return false;
xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
resp->count);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_readdirres *resp = rqstp->rq_resp;
struct xdr_buf *dirlist = &resp->dirlist;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
/* no more entries */
if (xdr_stream_encode_item_absent(xdr) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0)
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_statfsres *resp = rqstp->rq_resp;
struct kstatfs *stat = &resp->stats;
+ __be32 *p;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
p = xdr_reserve_space(xdr, XDR_UNIT * 5);
if (!p)
- return 0;
+ return false;
*p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2);
*p++ = cpu_to_be32(stat->f_bsize);
*p++ = cpu_to_be32(stat->f_blocks);
@@ -570,7 +547,7 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p)
break;
}
- return 1;
+ return true;
}
/**
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e73bdbb1634a..95457cfd37fc 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -568,6 +568,10 @@ struct nfs4_ol_stateid {
struct list_head st_locks;
struct nfs4_stateowner *st_stateowner;
struct nfs4_clnt_odstate *st_clnt_odstate;
+/*
+ * These bitmasks use 3 separate bits for READ, ALLOW, and BOTH; see the
+ * comment above bmap_to_share_mode() for explanation:
+ */
unsigned char st_access_bmap;
unsigned char st_deny_bmap;
struct nfs4_ol_stateid *st_openstp;
@@ -629,6 +633,7 @@ struct nfsd4_blocked_lock {
struct file_lock nbl_lock;
struct knfsd_fh nbl_fh;
struct nfsd4_callback nbl_cb;
+ struct kref nbl_kref;
};
struct nfsd4_compound_state;
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 1d3b881e7382..a8c5a02a84f0 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -45,7 +45,7 @@ static int nfsd_proc_show(struct seq_file *seq, void *v)
percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE]));
/* thread usage: */
- seq_printf(seq, "th %u 0", nfsdstats.th_cnt);
+ seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt));
/* deprecated thread usage histogram stats */
for (i = 0; i < 10; i++)
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 51ecda852e23..9b43dc3d9991 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -29,11 +29,9 @@ enum {
struct nfsd_stats {
struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM];
- /* Protected by nfsd_mutex */
- unsigned int th_cnt; /* number of available threads */
+ atomic_t th_cnt; /* number of available threads */
};
-
extern struct nfsd_stats nfsdstats;
extern struct svc_stat nfsd_svcstats;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 538520957a81..242fa123e0e9 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,25 +9,10 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+
#include "export.h"
#include "nfsfh.h"
-#define NFSD_TRACE_PROC_ARG_FIELDS \
- __field(unsigned int, netns_ino) \
- __field(u32, xid) \
- __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
- __array(unsigned char, client, sizeof(struct sockaddr_in6))
-
-#define NFSD_TRACE_PROC_ARG_ASSIGNMENTS \
- do { \
- __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
- __entry->xid = be32_to_cpu(rqstp->rq_xid); \
- memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
- rqstp->rq_xprt->xpt_locallen); \
- memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
- rqstp->rq_xprt->xpt_remotelen); \
- } while (0);
-
#define NFSD_TRACE_PROC_RES_FIELDS \
__field(unsigned int, netns_ino) \
__field(u32, xid) \
@@ -46,49 +31,41 @@
rqstp->rq_xprt->xpt_remotelen); \
} while (0);
-TRACE_EVENT(nfsd_garbage_args_err,
+DECLARE_EVENT_CLASS(nfsd_xdr_err_class,
TP_PROTO(
const struct svc_rqst *rqstp
),
TP_ARGS(rqstp),
TP_STRUCT__entry(
- NFSD_TRACE_PROC_ARG_FIELDS
-
+ __field(unsigned int, netns_ino)
+ __field(u32, xid)
__field(u32, vers)
__field(u32, proc)
+ __sockaddr(server, rqstp->rq_xprt->xpt_locallen)
+ __sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
- NFSD_TRACE_PROC_ARG_ASSIGNMENTS
+ const struct svc_xprt *xprt = rqstp->rq_xprt;
+ __entry->netns_ino = xprt->xpt_net->ns.inum;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
__entry->vers = rqstp->rq_vers;
__entry->proc = rqstp->rq_proc;
+ __assign_sockaddr(server, &xprt->xpt_local, xprt->xpt_locallen);
+ __assign_sockaddr(client, &xprt->xpt_remote, xprt->xpt_remotelen);
),
TP_printk("xid=0x%08x vers=%u proc=%u",
__entry->xid, __entry->vers, __entry->proc
)
);
-TRACE_EVENT(nfsd_cant_encode_err,
- TP_PROTO(
- const struct svc_rqst *rqstp
- ),
- TP_ARGS(rqstp),
- TP_STRUCT__entry(
- NFSD_TRACE_PROC_ARG_FIELDS
-
- __field(u32, vers)
- __field(u32, proc)
- ),
- TP_fast_assign(
- NFSD_TRACE_PROC_ARG_ASSIGNMENTS
+#define DEFINE_NFSD_XDR_ERR_EVENT(name) \
+DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \
+ TP_PROTO(const struct svc_rqst *rqstp), \
+ TP_ARGS(rqstp))
- __entry->vers = rqstp->rq_vers;
- __entry->proc = rqstp->rq_proc;
- ),
- TP_printk("xid=0x%08x vers=%u proc=%u",
- __entry->xid, __entry->vers, __entry->proc
- )
-);
+DEFINE_NFSD_XDR_ERR_EVENT(garbage_args);
+DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
#define show_nfsd_may_flags(x) \
__print_flags(x, "|", \
@@ -319,14 +296,14 @@ TRACE_EVENT(nfsd_export_update,
DECLARE_EVENT_CLASS(nfsd_io_class,
TP_PROTO(struct svc_rqst *rqstp,
struct svc_fh *fhp,
- loff_t offset,
- unsigned long len),
+ u64 offset,
+ u32 len),
TP_ARGS(rqstp, fhp, offset, len),
TP_STRUCT__entry(
__field(u32, xid)
__field(u32, fh_hash)
- __field(loff_t, offset)
- __field(unsigned long, len)
+ __field(u64, offset)
+ __field(u32, len)
),
TP_fast_assign(
__entry->xid = be32_to_cpu(rqstp->rq_xid);
@@ -334,7 +311,7 @@ DECLARE_EVENT_CLASS(nfsd_io_class,
__entry->offset = offset;
__entry->len = len;
),
- TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu",
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u",
__entry->xid, __entry->fh_hash,
__entry->offset, __entry->len)
)
@@ -343,8 +320,8 @@ DECLARE_EVENT_CLASS(nfsd_io_class,
DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
TP_PROTO(struct svc_rqst *rqstp, \
struct svc_fh *fhp, \
- loff_t offset, \
- unsigned long len), \
+ u64 offset, \
+ u32 len), \
TP_ARGS(rqstp, fhp, offset, len))
DEFINE_NFSD_IO_EVENT(read_start);
@@ -412,6 +389,56 @@ TRACE_EVENT(nfsd_dirent,
)
)
+DECLARE_EVENT_CLASS(nfsd_copy_err_class,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *src_fhp,
+ loff_t src_offset,
+ struct svc_fh *dst_fhp,
+ loff_t dst_offset,
+ u64 count,
+ int status),
+ TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, count, status),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, src_fh_hash)
+ __field(loff_t, src_offset)
+ __field(u32, dst_fh_hash)
+ __field(loff_t, dst_offset)
+ __field(u64, count)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->src_fh_hash = knfsd_fh_hash(&src_fhp->fh_handle);
+ __entry->src_offset = src_offset;
+ __entry->dst_fh_hash = knfsd_fh_hash(&dst_fhp->fh_handle);
+ __entry->dst_offset = dst_offset;
+ __entry->count = count;
+ __entry->status = status;
+ ),
+ TP_printk("xid=0x%08x src_fh_hash=0x%08x src_offset=%lld "
+ "dst_fh_hash=0x%08x dst_offset=%lld "
+ "count=%llu status=%d",
+ __entry->xid, __entry->src_fh_hash, __entry->src_offset,
+ __entry->dst_fh_hash, __entry->dst_offset,
+ (unsigned long long)__entry->count,
+ __entry->status)
+)
+
+#define DEFINE_NFSD_COPY_ERR_EVENT(name) \
+DEFINE_EVENT(nfsd_copy_err_class, nfsd_##name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *src_fhp, \
+ loff_t src_offset, \
+ struct svc_fh *dst_fhp, \
+ loff_t dst_offset, \
+ u64 count, \
+ int status), \
+ TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, \
+ count, status))
+
+DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err);
+
#include "state.h"
#include "filecache.h"
#include "vfs.h"
@@ -537,6 +564,34 @@ DEFINE_EVENT(nfsd_net_class, nfsd_##name, \
DEFINE_NET_EVENT(grace_start);
DEFINE_NET_EVENT(grace_complete);
+TRACE_EVENT(nfsd_writeverf_reset,
+ TP_PROTO(
+ const struct nfsd_net *nn,
+ const struct svc_rqst *rqstp,
+ int error
+ ),
+ TP_ARGS(nn, rqstp, error),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(u32, xid)
+ __field(int, error)
+ __array(unsigned char, verifier, NFS4_VERIFIER_SIZE)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->error = error;
+
+ /* avoid seqlock inside TP_fast_assign */
+ memcpy(__entry->verifier, nn->writeverf,
+ NFS4_VERIFIER_SIZE);
+ ),
+ TP_printk("boot_time=%16llx xid=0x%08x error=%d new verifier=0x%s",
+ __entry->boot_time, __entry->xid, __entry->error,
+ __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE)
+ )
+);
+
TRACE_EVENT(nfsd_clid_cred_mismatch,
TP_PROTO(
const struct nfs4_client *clp,
@@ -548,20 +603,21 @@ TRACE_EVENT(nfsd_clid_cred_mismatch,
__field(u32, cl_id)
__field(unsigned long, cl_flavor)
__field(unsigned long, new_flavor)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->cl_flavor = clp->cl_cred.cr_flavor;
__entry->new_flavor = rqstp->rq_cred.cr_flavor;
- memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
),
TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc",
__entry->cl_boot, __entry->cl_id,
show_nfsd_authflavor(__entry->cl_flavor),
- show_nfsd_authflavor(__entry->new_flavor), __entry->addr
+ show_nfsd_authflavor(__entry->new_flavor),
+ __get_sockaddr(addr)
)
)
@@ -577,7 +633,7 @@ TRACE_EVENT(nfsd_clid_verf_mismatch,
__field(u32, cl_id)
__array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE)
__array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
@@ -586,14 +642,14 @@ TRACE_EVENT(nfsd_clid_verf_mismatch,
NFS4_VERIFIER_SIZE);
memcpy(__entry->new_verifier, (void *)verf,
NFS4_VERIFIER_SIZE);
- memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
),
TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc",
__entry->cl_boot, __entry->cl_id,
__print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE),
__print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE),
- __entry->addr
+ __get_sockaddr(addr)
)
);
@@ -843,18 +899,17 @@ TRACE_EVENT(nfsd_cb_args,
__field(u32, cl_id)
__field(u32, prog)
__field(u32, ident)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, conn->cb_addrlen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->prog = conn->cb_prog;
__entry->ident = conn->cb_ident;
- memcpy(__entry->addr, &conn->cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &conn->cb_addr, conn->cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->prog, __entry->ident)
);
@@ -886,17 +941,17 @@ DECLARE_EVENT_CLASS(nfsd_cb_class,
__field(unsigned long, state)
__field(u32, cl_boot)
__field(u32, cl_id)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->state = clp->cl_cb_state;
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x state=%s",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
show_cb_state(__entry->state))
);
@@ -936,7 +991,7 @@ TRACE_EVENT(nfsd_cb_setup,
__field(u32, cl_boot)
__field(u32, cl_id)
__field(unsigned long, authflavor)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
__array(unsigned char, netid, 8)
),
TP_fast_assign(
@@ -944,11 +999,11 @@ TRACE_EVENT(nfsd_cb_setup,
__entry->cl_id = clp->cl_clientid.cl_id;
strlcpy(__entry->netid, netid, sizeof(__entry->netid));
__entry->authflavor = authflavor;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->netid, show_nfsd_authflavor(__entry->authflavor))
);
@@ -962,30 +1017,32 @@ TRACE_EVENT(nfsd_cb_setup_err,
__field(long, error)
__field(u32, cl_boot)
__field(u32, cl_id)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->error = error;
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x error=%ld",
- __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error)
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->error)
);
-TRACE_EVENT(nfsd_cb_recall,
+TRACE_EVENT_CONDITION(nfsd_cb_recall,
TP_PROTO(
const struct nfs4_stid *stid
),
TP_ARGS(stid),
+ TP_CONDITION(stid->sc_client),
TP_STRUCT__entry(
__field(u32, cl_boot)
__field(u32, cl_id)
__field(u32, si_id)
__field(u32, si_generation)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, stid->sc_client->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
const stateid_t *stp = &stid->sc_stateid;
@@ -995,14 +1052,11 @@ TRACE_EVENT(nfsd_cb_recall,
__entry->cl_id = stp->si_opaque.so_clid.cl_id;
__entry->si_id = stp->si_opaque.so_id;
__entry->si_generation = stp->si_generation;
- if (clp)
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
- else
- memset(__entry->addr, 0, sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->si_id, __entry->si_generation)
);
@@ -1016,7 +1070,7 @@ TRACE_EVENT(nfsd_cb_notify_lock,
__field(u32, cl_boot)
__field(u32, cl_id)
__field(u32, fh_hash)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, lo->lo_owner.so_client->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
const struct nfs4_client *clp = lo->lo_owner.so_client;
@@ -1024,11 +1078,11 @@ TRACE_EVENT(nfsd_cb_notify_lock,
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh);
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->fh_hash)
);
@@ -1049,7 +1103,7 @@ TRACE_EVENT(nfsd_cb_offload,
__field(u32, fh_hash)
__field(int, status)
__field(u64, count)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
@@ -1059,11 +1113,11 @@ TRACE_EVENT(nfsd_cb_offload,
__entry->fh_hash = knfsd_fh_hash(fh);
__entry->status = be32_to_cpu(status);
__entry->count = count;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->si_id, __entry->si_generation,
__entry->fh_hash, __entry->count, __entry->status)
);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 738d564ca4ce..c22ad0532e8e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -26,20 +26,20 @@
#include <linux/xattr.h>
#include <linux/jhash.h>
#include <linux/ima.h>
+#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/security.h>
-#ifdef CONFIG_NFSD_V3
#include "xdr3.h"
-#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
#include "../internal.h"
#include "acl.h"
#include "idmap.h"
+#include "xdr4.h"
#endif /* CONFIG_NFSD_V4 */
#include "nfsd.h"
@@ -244,6 +244,7 @@ out_nfserr:
* returned. Otherwise the covered directory is returned.
* NOTE: this mountpoint crossing is not supported properly by all
* clients and is explicitly disallowed for NFSv3
+ * NeilBrown <neilb@cse.unsw.edu.au>
*/
__be32
nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
@@ -433,6 +434,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
.ia_size = iap->ia_size,
};
+ host_err = -EFBIG;
+ if (iap->ia_size < 0)
+ goto out_unlock;
+
host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
if (host_err)
goto out_unlock;
@@ -516,15 +521,23 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
#endif
-__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
- struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync)
+static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp)
+{
+ return &((struct nfsd4_compoundres *)rqstp->rq_resp)->cstate;
+}
+
+__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
+ struct nfsd_file *nf_src, u64 src_pos,
+ struct nfsd_file *nf_dst, u64 dst_pos,
+ u64 count, bool sync)
{
struct file *src = nf_src->nf_file;
struct file *dst = nf_dst->nf_file;
+ errseq_t since;
loff_t cloned;
__be32 ret = 0;
- down_write(&nf_dst->nf_rwsem);
+ since = READ_ONCE(dst->f_wb_err);
cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
if (cloned < 0) {
ret = nfserrno(cloned);
@@ -539,15 +552,25 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
int status = vfs_fsync_range(dst, dst_pos, dst_end, 0);
if (!status)
+ status = filemap_check_wb_err(dst->f_mapping, since);
+ if (!status)
status = commit_inode_metadata(file_inode(src));
if (status < 0) {
- nfsd_reset_boot_verifier(net_generic(nf_dst->nf_net,
- nfsd_net_id));
+ struct nfsd_net *nn = net_generic(nf_dst->nf_net,
+ nfsd_net_id);
+
+ trace_nfsd_clone_file_range_err(rqstp,
+ &nfsd4_get_cstate(rqstp)->save_fh,
+ src_pos,
+ &nfsd4_get_cstate(rqstp)->current_fh,
+ dst_pos,
+ count, status);
+ nfsd_reset_write_verifier(nn);
+ trace_nfsd_writeverf_reset(nn, rqstp, status);
ret = nfserrno(status);
}
}
out_err:
- up_write(&nf_dst->nf_rwsem);
return ret;
}
@@ -584,7 +607,6 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
#endif /* defined(CONFIG_NFSD_V4) */
-#ifdef CONFIG_NFSD_V3
/*
* Check server access rights to a file system object
*/
@@ -696,7 +718,6 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
out:
return error;
}
-#endif /* CONFIG_NFSD_V3 */
int nfsd_open_break_lease(struct inode *inode, int access)
{
@@ -729,9 +750,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
path.dentry = fhp->fh_dentry;
inode = d_inode(path.dentry);
- /* Disallow write access to files with the append-only bit set
- * or any access when mandatory locking enabled
- */
err = nfserr_perm;
if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
goto out;
@@ -779,6 +797,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
int may_flags, struct file **filp)
{
__be32 err;
+ bool retried = false;
validate_process_creds();
/*
@@ -794,9 +813,16 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
*/
if (type == S_IFREG)
may_flags |= NFSD_MAY_OWNER_OVERRIDE;
+retry:
err = fh_verify(rqstp, fhp, type, may_flags);
- if (!err)
+ if (!err) {
err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ if (err == nfserr_stale && !retried) {
+ retried = true;
+ fh_put(fhp);
+ goto retry;
+ }
+ }
validate_process_creds();
return err;
}
@@ -946,10 +972,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
unsigned long *cnt, int stable,
__be32 *verf)
{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct file *file = nf->nf_file;
struct super_block *sb = file_inode(file)->i_sb;
struct svc_export *exp;
struct iov_iter iter;
+ errseq_t since;
__be32 nfserr;
int host_err;
int use_wgather;
@@ -987,36 +1015,28 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
flags |= RWF_SYNC;
iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
- if (flags & RWF_SYNC) {
- down_write(&nf->nf_rwsem);
- host_err = vfs_iter_write(file, &iter, &pos, flags);
- if (host_err < 0)
- nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
- nfsd_net_id));
- up_write(&nf->nf_rwsem);
- } else {
- down_read(&nf->nf_rwsem);
- if (verf)
- nfsd_copy_boot_verifier(verf,
- net_generic(SVC_NET(rqstp),
- nfsd_net_id));
- host_err = vfs_iter_write(file, &iter, &pos, flags);
- up_read(&nf->nf_rwsem);
- }
+ since = READ_ONCE(file->f_wb_err);
+ if (verf)
+ nfsd_copy_write_verifier(verf, nn);
+ host_err = vfs_iter_write(file, &iter, &pos, flags);
if (host_err < 0) {
- nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
- nfsd_net_id));
+ nfsd_reset_write_verifier(nn);
+ trace_nfsd_writeverf_reset(nn, rqstp, host_err);
goto out_nfserr;
}
*cnt = host_err;
nfsd_stats_io_write_add(exp, *cnt);
fsnotify_modify(file);
+ host_err = filemap_check_wb_err(file->f_mapping, since);
+ if (host_err < 0)
+ goto out_nfserr;
if (stable && use_wgather) {
host_err = wait_for_concurrent_writes(file);
- if (host_err < 0)
- nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
- nfsd_net_id));
+ if (host_err < 0) {
+ nfsd_reset_write_verifier(nn);
+ trace_nfsd_writeverf_reset(nn, rqstp, host_err);
+ }
}
out_nfserr:
@@ -1090,78 +1110,82 @@ out:
return err;
}
-#ifdef CONFIG_NFSD_V3
-static int
-nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset,
- loff_t end)
-{
- struct address_space *mapping = nf->nf_file->f_mapping;
- int ret = filemap_fdatawrite_range(mapping, offset, end);
-
- if (ret)
- return ret;
- filemap_fdatawait_range_keep_errors(mapping, offset, end);
- return 0;
-}
-
-/*
- * Commit all pending writes to stable storage.
+/**
+ * nfsd_commit - Commit pending writes to stable storage
+ * @rqstp: RPC request being processed
+ * @fhp: NFS filehandle
+ * @offset: raw offset from beginning of file
+ * @count: raw count of bytes to sync
+ * @verf: filled in with the server's current write verifier
*
- * Note: we only guarantee that data that lies within the range specified
- * by the 'offset' and 'count' parameters will be synced.
+ * Note: we guarantee that data that lies within the range specified
+ * by the 'offset' and 'count' parameters will be synced. The server
+ * is permitted to sync data that lies outside this range at the
+ * same time.
*
* Unfortunately we cannot lock the file to make sure we return full WCC
* data to the client, as locking happens lower down in the filesystem.
+ *
+ * Return values:
+ * An nfsstat value in network byte order.
*/
__be32
-nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
- loff_t offset, unsigned long count, __be32 *verf)
+nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
+ u32 count, __be32 *verf)
{
+ u64 maxbytes;
+ loff_t start, end;
+ struct nfsd_net *nn;
struct nfsd_file *nf;
- loff_t end = LLONG_MAX;
- __be32 err = nfserr_inval;
-
- if (offset < 0)
- goto out;
- if (count != 0) {
- end = offset + (loff_t)count - 1;
- if (end < offset)
- goto out;
- }
+ __be32 err;
err = nfsd_file_acquire(rqstp, fhp,
NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
if (err)
goto out;
+
+ /*
+ * Convert the client-provided (offset, count) range to a
+ * (start, end) range. If the client-provided range falls
+ * outside the maximum file size of the underlying FS,
+ * clamp the sync range appropriately.
+ */
+ start = 0;
+ end = LLONG_MAX;
+ maxbytes = (u64)fhp->fh_dentry->d_sb->s_maxbytes;
+ if (offset < maxbytes) {
+ start = offset;
+ if (count && (offset + count - 1 < maxbytes))
+ end = offset + count - 1;
+ }
+
+ nn = net_generic(nf->nf_net, nfsd_net_id);
if (EX_ISSYNC(fhp->fh_export)) {
- int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end);
+ errseq_t since = READ_ONCE(nf->nf_file->f_wb_err);
+ int err2;
- down_write(&nf->nf_rwsem);
- if (!err2)
- err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
+ err2 = vfs_fsync_range(nf->nf_file, start, end, 0);
switch (err2) {
case 0:
- nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
- nfsd_net_id));
+ nfsd_copy_write_verifier(verf, nn);
+ err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
+ since);
break;
case -EINVAL:
err = nfserr_notsupp;
break;
default:
- err = nfserrno(err2);
- nfsd_reset_boot_verifier(net_generic(nf->nf_net,
- nfsd_net_id));
+ nfsd_reset_write_verifier(nn);
+ trace_nfsd_writeverf_reset(nn, rqstp, err2);
}
- up_write(&nf->nf_rwsem);
+ err = nfserrno(err2);
} else
- nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
- nfsd_net_id));
+ nfsd_copy_write_verifier(verf, nn);
nfsd_file_put(nf);
out:
return err;
}
-#endif /* CONFIG_NFSD_V3 */
static __be32
nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
@@ -1351,8 +1375,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
rdev, resfhp);
}
-#ifdef CONFIG_NFSD_V3
-
/*
* NFSv3 and NFSv4 version of nfsd_create
*/
@@ -1410,7 +1432,8 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (nfsd_create_is_exclusive(createmode)) {
/* solaris7 gets confused (bugid 4218508) if these have
- * the high bit set, so just clear the high bits. If this is
+ * the high bit set, as do xfs filesystems without the
+ * "bigtime" feature. So just clear the high bits. If this is
* ever changed to use different attrs for storing the
* verifier, then do_open_lookup() will also need to be fixed
* accordingly.
@@ -1517,7 +1540,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserrno(host_err);
goto out;
}
-#endif /* CONFIG_NFSD_V3 */
/*
* Read a symlink. On entry, *lenp must contain the maximum path length that
@@ -1748,8 +1770,8 @@ retry:
* so do it by hand */
trap = lock_rename(tdentry, fdentry);
ffhp->fh_locked = tfhp->fh_locked = true;
- fill_pre_wcc(ffhp);
- fill_pre_wcc(tfhp);
+ fh_fill_pre_attrs(ffhp);
+ fh_fill_pre_attrs(tfhp);
odentry = lookup_one_len(fname, fdentry, flen);
host_err = PTR_ERR(odentry);
@@ -1809,8 +1831,8 @@ retry:
* were the same, so again we do it by hand.
*/
if (!close_cached) {
- fill_post_wcc(ffhp);
- fill_post_wcc(tfhp);
+ fh_fill_post_attrs(ffhp);
+ fh_fill_post_attrs(tfhp);
}
unlock_rename(tdentry, fdentry);
ffhp->fh_locked = tfhp->fh_locked = false;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index b21b76e6b9a8..ccb87b2864f6 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -57,7 +57,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
struct xdr_netobj *);
__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
struct file *, loff_t, loff_t, int);
-__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
+__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
+ struct nfsd_file *nf_src, u64 src_pos,
struct nfsd_file *nf_dst, u64 dst_pos,
u64 count, bool sync);
#endif /* CONFIG_NFSD_V4 */
@@ -67,15 +68,13 @@ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
int type, dev_t rdev, struct svc_fh *res);
-#ifdef CONFIG_NFSD_V3
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
struct svc_fh *res, int createmode,
u32 *verifier, bool *truncp, bool *created);
-__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
- loff_t, unsigned long, __be32 *verf);
-#endif /* CONFIG_NFSD_V3 */
+__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
+ u64 offset, u32 count, __be32 *verf);
#ifdef CONFIG_NFSD_V4
__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *name, void **bufp, int *lenp);
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index f45b4bc93f52..852f71580bd0 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -32,8 +32,8 @@ struct nfsd_readargs {
struct nfsd_writeargs {
svc_fh fh;
__u32 offset;
- int len;
- struct kvec first;
+ __u32 len;
+ struct xdr_buf payload;
};
struct nfsd_createargs {
@@ -141,23 +141,24 @@ union nfsd_xdrstore {
#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore)
-int nfssvc_decode_fhandleargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_readargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_createargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *);
-int nfssvc_encode_statres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_attrstatres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_diropres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_readres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *);
+bool nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+
+bool nfssvc_encode_statres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_attrstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_diropres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset);
int nfssvc_encode_entry(void *data, const char *name, int namlen,
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 933008382bbe..03fe4e21306c 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -40,7 +40,7 @@ struct nfsd3_writeargs {
__u32 count;
int stable;
__u32 len;
- struct kvec first;
+ struct xdr_buf payload;
};
struct nfsd3_createargs {
@@ -265,36 +265,37 @@ union nfsd3_xdrstore {
#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore)
-int nfs3svc_decode_fhandleargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_getattrres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_lookupres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_readres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_createres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *);
+bool nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_commitargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+
+bool nfs3svc_encode_getattrres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_wccstat(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_lookupres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_writeres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_createres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_renameres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_linkres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_commitres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
void nfs3svc_release_fhandle(struct svc_rqst *);
void nfs3svc_release_fhandle2(struct svc_rqst *);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 3e4052e3bd50..846ab6df9d48 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -702,10 +702,11 @@ struct nfsd4_compoundres {
struct xdr_stream *xdr;
struct svc_rqst * rqstp;
+ __be32 *statusp;
u32 taglen;
char * tag;
u32 opcnt;
- __be32 * tagp; /* tag, opcount encode location */
+
struct nfsd4_compound_state cstate;
};
@@ -756,8 +757,8 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
-int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *);
-int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *);
+bool nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index adf3bb0a8048..6ce8617b562d 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * alloc.c - NILFS dat/inode allocator
+ * NILFS dat/inode allocator
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 0303c3968cee..b667e869ac07 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
+ * Persistent object (dat entry/disk inode) allocator/deallocator
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 5900879d5693..798a2c1b38c6 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * bmap.c - NILFS block mapping.
+ * NILFS block mapping.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 2c63858e81c9..608168a5cb88 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * bmap.h - NILFS block mapping.
+ * NILFS block mapping.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4391fd3abd8f..ca611ac09f7c 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * btnode.c - NILFS B-tree node cache
+ * NILFS B-tree node cache
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -20,6 +20,23 @@
#include "page.h"
#include "btnode.h"
+
+/**
+ * nilfs_init_btnc_inode - initialize B-tree node cache inode
+ * @btnc_inode: inode to be initialized
+ *
+ * nilfs_init_btnc_inode() sets up an inode for B-tree node cache.
+ */
+void nilfs_init_btnc_inode(struct inode *btnc_inode)
+{
+ struct nilfs_inode_info *ii = NILFS_I(btnc_inode);
+
+ btnc_inode->i_mode = S_IFREG;
+ ii->i_flags = 0;
+ memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap));
+ mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS);
+}
+
void nilfs_btnode_cache_clear(struct address_space *btnc)
{
invalidate_mapping_pages(btnc, 0, -1);
@@ -29,7 +46,7 @@ void nilfs_btnode_cache_clear(struct address_space *btnc)
struct buffer_head *
nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
{
- struct inode *inode = NILFS_BTNC_I(btnc);
+ struct inode *inode = btnc->host;
struct buffer_head *bh;
bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
@@ -57,7 +74,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
struct buffer_head **pbh, sector_t *submit_ptr)
{
struct buffer_head *bh;
- struct inode *inode = NILFS_BTNC_I(btnc);
+ struct inode *inode = btnc->host;
struct page *page;
int err;
@@ -157,7 +174,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
struct nilfs_btnode_chkey_ctxt *ctxt)
{
struct buffer_head *obh, *nbh;
- struct inode *inode = NILFS_BTNC_I(btnc);
+ struct inode *inode = btnc->host;
__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
int err;
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 0f88dbc9bcb3..bd5544e63a01 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * btnode.h - NILFS B-tree node cache
+ * NILFS B-tree node cache
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -30,6 +30,7 @@ struct nilfs_btnode_chkey_ctxt {
struct buffer_head *newbh;
};
+void nilfs_init_btnc_inode(struct inode *btnc_inode);
void nilfs_btnode_cache_clear(struct address_space *);
struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
__u64 blocknr);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index ab9ec073330f..f544c22fff78 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * btree.c - NILFS B-tree.
+ * NILFS B-tree.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -58,7 +58,8 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path)
static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
__u64 ptr, struct buffer_head **bhp)
{
- struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+ struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
+ struct address_space *btnc = btnc_inode->i_mapping;
struct buffer_head *bh;
bh = nilfs_btnode_create_block(btnc, ptr);
@@ -470,7 +471,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
struct buffer_head **bhp,
const struct nilfs_btree_readahead_info *ra)
{
- struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+ struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
+ struct address_space *btnc = btnc_inode->i_mapping;
struct buffer_head *bh, *ra_bh;
sector_t submit_ptr = 0;
int ret;
@@ -1741,6 +1743,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
dat = nilfs_bmap_get_dat(btree);
}
+ ret = nilfs_attach_btree_node_cache(&NILFS_BMAP_I(btree)->vfs_inode);
+ if (ret < 0)
+ return ret;
+
ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
if (ret < 0)
return ret;
@@ -1913,7 +1919,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
path[level].bp_ctxt.bh = path[level].bp_bh;
ret = nilfs_btnode_prepare_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
if (ret < 0) {
nilfs_dat_abort_update(dat,
@@ -1939,7 +1945,7 @@ static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
if (buffer_nilfs_node(path[level].bp_bh)) {
nilfs_btnode_commit_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
path[level].bp_bh = path[level].bp_ctxt.bh;
}
@@ -1958,7 +1964,7 @@ static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
&path[level].bp_newreq.bpr_req);
if (buffer_nilfs_node(path[level].bp_bh))
nilfs_btnode_abort_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
}
@@ -2134,7 +2140,8 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
struct list_head *listp)
{
- struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
+ struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
+ struct address_space *btcache = btnc_inode->i_mapping;
struct list_head lists[NILFS_BTREE_LEVEL_MAX];
struct pagevec pvec;
struct buffer_head *bh, *head;
@@ -2188,12 +2195,12 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
path[level].bp_ctxt.newkey = blocknr;
path[level].bp_ctxt.bh = *bh;
ret = nilfs_btnode_prepare_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
if (ret < 0)
return ret;
nilfs_btnode_commit_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
*bh = path[level].bp_ctxt.bh;
}
@@ -2398,6 +2405,10 @@ int nilfs_btree_init(struct nilfs_bmap *bmap)
if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode))
ret = -EIO;
+ else
+ ret = nilfs_attach_btree_node_cache(
+ &NILFS_BMAP_I(bmap)->vfs_inode);
+
return ret;
}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index d1421b646ce4..92868e1a48ca 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * btree.h - NILFS B-tree.
+ * NILFS B-tree.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index ce144776b4ef..9ebefb3acb0e 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * cpfile.c - NILFS checkpoint file.
+ * NILFS checkpoint file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 6336222df24a..edabb2dc5756 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * cpfile.h - NILFS checkpoint file.
+ * NILFS checkpoint file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8bccdf1158fc..3b55e239705f 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * dat.c - NILFS disk address translation.
+ * NILFS disk address translation.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -497,7 +497,9 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
di = NILFS_DAT_I(dat);
lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
nilfs_palloc_setup_cache(dat, &di->palloc_cache);
- nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+ err = nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+ if (err)
+ goto failed;
err = nilfs_read_inode_common(dat, raw_inode);
if (err)
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index b17ee34580ae..468c82d26183 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * dat.h - NILFS disk address translation.
+ * NILFS disk address translation.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 81394e22d0a0..f8f4c2ff52f4 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * dir.c - NILFS directory entry operations
+ * NILFS directory entry operations
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index f353101955e3..a35f2795b242 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * direct.c - NILFS direct block pointer.
+ * NILFS direct block pointer.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index ec9a23c77994..b7ca896269af 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * direct.h - NILFS direct block pointer.
+ * NILFS direct block pointer.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7cf765258fda..a265d391ffe9 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * file.c - NILFS regular file handling primitives including fsync().
+ * NILFS regular file handling primitives including fsync().
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 448320496856..04fdd420eae7 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * gcinode.c - dummy inodes to buffer blocks for garbage collection
+ * Dummy inodes to buffer blocks for garbage collection
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -126,9 +126,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
__u64 vbn, struct buffer_head **out_bh)
{
+ struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode;
int ret;
- ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+ ret = nilfs_btnode_submit_block(btnc_inode->i_mapping,
vbn ? : pbn, pbn, REQ_OP_READ, 0,
out_bh, &pbn);
if (ret == -EEXIST) /* internal code (cache hit) */
@@ -170,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode)
ii->i_flags = 0;
nilfs_bmap_init_gc(ii->i_bmap);
- return 0;
+ return nilfs_attach_btree_node_cache(inode);
}
/**
@@ -185,7 +186,7 @@ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
list_del_init(&ii->i_dirty);
truncate_inode_pages(&ii->vfs_inode.i_data, 0);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+ nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping);
iput(&ii->vfs_inode);
}
}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 02727ed3a7c6..a8a4bc8490b4 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * ifile.c - NILFS inode file
+ * NILFS inode file
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index a1e1e5711a05..35c5273f4821 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * ifile.h - NILFS inode file
+ * NILFS inode file
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2e8eb263cf0f..6045cea21f52 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * inode.c - NILFS inode operations.
+ * NILFS inode operations.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -29,12 +29,16 @@
* @cno: checkpoint number
* @root: pointer on NILFS root object (mounted checkpoint)
* @for_gc: inode for GC flag
+ * @for_btnc: inode for B-tree node cache flag
+ * @for_shadow: inode for shadowed page cache flag
*/
struct nilfs_iget_args {
u64 ino;
__u64 cno;
struct nilfs_root *root;
- int for_gc;
+ bool for_gc;
+ bool for_btnc;
+ bool for_shadow;
};
static int nilfs_iget_test(struct inode *inode, void *opaque);
@@ -199,23 +203,22 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
-static int nilfs_set_page_dirty(struct page *page)
+static bool nilfs_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- int ret = __set_page_dirty_nobuffers(page);
+ struct inode *inode = mapping->host;
+ struct buffer_head *head;
+ unsigned int nr_dirty = 0;
+ bool ret = filemap_dirty_folio(mapping, folio);
- if (page_has_buffers(page)) {
- unsigned int nr_dirty = 0;
- struct buffer_head *bh, *head;
+ /*
+ * The page may not be locked, eg if called from try_to_unmap_one()
+ */
+ spin_lock(&mapping->private_lock);
+ head = folio_buffers(folio);
+ if (head) {
+ struct buffer_head *bh = head;
- /*
- * This page is locked by callers, and no other thread
- * concurrently marks its buffers dirty since they are
- * only dirtied through routines in fs/buffer.c in
- * which call sites of mark_buffer_dirty are protected
- * by page lock.
- */
- bh = head = page_buffers(page);
do {
/* Do not mark hole blocks dirty */
if (buffer_dirty(bh) || !buffer_mapped(bh))
@@ -224,14 +227,13 @@ static int nilfs_set_page_dirty(struct page *page)
set_buffer_dirty(bh);
nr_dirty++;
} while (bh = bh->b_this_page, bh != head);
-
- if (nr_dirty)
- nilfs_set_file_dirty(inode, nr_dirty);
} else if (ret) {
- unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
+ nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits);
+ }
+ spin_unlock(&mapping->private_lock);
+ if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
- }
return ret;
}
@@ -299,12 +301,12 @@ const struct address_space_operations nilfs_aops = {
.writepage = nilfs_writepage,
.readpage = nilfs_readpage,
.writepages = nilfs_writepages,
- .set_page_dirty = nilfs_set_page_dirty,
+ .dirty_folio = nilfs_dirty_folio,
.readahead = nilfs_readahead,
.write_begin = nilfs_write_begin,
.write_end = nilfs_write_end,
/* .releasepage = nilfs_releasepage, */
- .invalidatepage = block_invalidatepage,
+ .invalidate_folio = block_invalidate_folio,
.direct_IO = nilfs_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
};
@@ -314,7 +316,8 @@ static int nilfs_insert_inode_locked(struct inode *inode,
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ .ino = ino, .root = root, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = false
};
return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
@@ -527,6 +530,19 @@ static int nilfs_iget_test(struct inode *inode, void *opaque)
return 0;
ii = NILFS_I(inode);
+ if (test_bit(NILFS_I_BTNC, &ii->i_state)) {
+ if (!args->for_btnc)
+ return 0;
+ } else if (args->for_btnc) {
+ return 0;
+ }
+ if (test_bit(NILFS_I_SHADOW, &ii->i_state)) {
+ if (!args->for_shadow)
+ return 0;
+ } else if (args->for_shadow) {
+ return 0;
+ }
+
if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
return !args->for_gc;
@@ -538,15 +554,17 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
struct nilfs_iget_args *args = opaque;
inode->i_ino = args->ino;
- if (args->for_gc) {
+ NILFS_I(inode)->i_cno = args->cno;
+ NILFS_I(inode)->i_root = args->root;
+ if (args->root && args->ino == NILFS_ROOT_INO)
+ nilfs_get_root(args->root);
+
+ if (args->for_gc)
NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE);
- NILFS_I(inode)->i_cno = args->cno;
- NILFS_I(inode)->i_root = NULL;
- } else {
- if (args->root && args->ino == NILFS_ROOT_INO)
- nilfs_get_root(args->root);
- NILFS_I(inode)->i_root = args->root;
- }
+ if (args->for_btnc)
+ NILFS_I(inode)->i_state |= BIT(NILFS_I_BTNC);
+ if (args->for_shadow)
+ NILFS_I(inode)->i_state |= BIT(NILFS_I_SHADOW);
return 0;
}
@@ -554,7 +572,8 @@ struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ .ino = ino, .root = root, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = false
};
return ilookup5(sb, ino, nilfs_iget_test, &args);
@@ -564,7 +583,8 @@ struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ .ino = ino, .root = root, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = false
};
return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
@@ -595,7 +615,8 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
__u64 cno)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
+ .ino = ino, .root = NULL, .cno = cno, .for_gc = true,
+ .for_btnc = false, .for_shadow = false
};
struct inode *inode;
int err;
@@ -615,6 +636,113 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
return inode;
}
+/**
+ * nilfs_attach_btree_node_cache - attach a B-tree node cache to the inode
+ * @inode: inode object
+ *
+ * nilfs_attach_btree_node_cache() attaches a B-tree node cache to @inode,
+ * or does nothing if the inode already has it. This function allocates
+ * an additional inode to maintain page cache of B-tree nodes one-on-one.
+ *
+ * Return Value: On success, 0 is returned. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_attach_btree_node_cache(struct inode *inode)
+{
+ struct nilfs_inode_info *ii = NILFS_I(inode);
+ struct inode *btnc_inode;
+ struct nilfs_iget_args args;
+
+ if (ii->i_assoc_inode)
+ return 0;
+
+ args.ino = inode->i_ino;
+ args.root = ii->i_root;
+ args.cno = ii->i_cno;
+ args.for_gc = test_bit(NILFS_I_GCINODE, &ii->i_state) != 0;
+ args.for_btnc = true;
+ args.for_shadow = test_bit(NILFS_I_SHADOW, &ii->i_state) != 0;
+
+ btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test,
+ nilfs_iget_set, &args);
+ if (unlikely(!btnc_inode))
+ return -ENOMEM;
+ if (btnc_inode->i_state & I_NEW) {
+ nilfs_init_btnc_inode(btnc_inode);
+ unlock_new_inode(btnc_inode);
+ }
+ NILFS_I(btnc_inode)->i_assoc_inode = inode;
+ NILFS_I(btnc_inode)->i_bmap = ii->i_bmap;
+ ii->i_assoc_inode = btnc_inode;
+
+ return 0;
+}
+
+/**
+ * nilfs_detach_btree_node_cache - detach the B-tree node cache from the inode
+ * @inode: inode object
+ *
+ * nilfs_detach_btree_node_cache() detaches the B-tree node cache and its
+ * holder inode bound to @inode, or does nothing if @inode doesn't have it.
+ */
+void nilfs_detach_btree_node_cache(struct inode *inode)
+{
+ struct nilfs_inode_info *ii = NILFS_I(inode);
+ struct inode *btnc_inode = ii->i_assoc_inode;
+
+ if (btnc_inode) {
+ NILFS_I(btnc_inode)->i_assoc_inode = NULL;
+ ii->i_assoc_inode = NULL;
+ iput(btnc_inode);
+ }
+}
+
+/**
+ * nilfs_iget_for_shadow - obtain inode for shadow mapping
+ * @inode: inode object that uses shadow mapping
+ *
+ * nilfs_iget_for_shadow() allocates a pair of inodes that holds page
+ * caches for shadow mapping. The page cache for data pages is set up
+ * in one inode and the one for b-tree node pages is set up in the
+ * other inode, which is attached to the former inode.
+ *
+ * Return Value: On success, a pointer to the inode for data pages is
+ * returned. On errors, one of the following negative error code is returned
+ * in a pointer type.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+struct inode *nilfs_iget_for_shadow(struct inode *inode)
+{
+ struct nilfs_iget_args args = {
+ .ino = inode->i_ino, .root = NULL, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = true
+ };
+ struct inode *s_inode;
+ int err;
+
+ s_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test,
+ nilfs_iget_set, &args);
+ if (unlikely(!s_inode))
+ return ERR_PTR(-ENOMEM);
+ if (!(s_inode->i_state & I_NEW))
+ return inode;
+
+ NILFS_I(s_inode)->i_flags = 0;
+ memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap));
+ mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS);
+
+ err = nilfs_attach_btree_node_cache(s_inode);
+ if (unlikely(err)) {
+ iget_failed(s_inode);
+ return ERR_PTR(err);
+ }
+ unlock_new_inode(s_inode);
+ return s_inode;
+}
+
void nilfs_write_inode_common(struct inode *inode,
struct nilfs_inode *raw_inode, int has_bmap)
{
@@ -762,7 +890,8 @@ static void nilfs_clear_inode(struct inode *inode)
if (test_bit(NILFS_I_BMAP, &ii->i_state))
nilfs_bmap_clear(ii->i_bmap);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+ if (!test_bit(NILFS_I_BTNC, &ii->i_state))
+ nilfs_detach_btree_node_cache(inode);
if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
nilfs_put_root(ii->i_root);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 640ac8fe891e..fec194a666f4 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * ioctl.c - NILFS ioctl operations.
+ * NILFS ioctl operations.
*
* Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
*
@@ -1107,7 +1107,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
goto out;
ret = -ERANGE;
- if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
+ if (range[1] > bdev_nr_bytes(inode->i_sb->s_bdev))
goto out;
segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 97769fe4d588..d29a0f2b9c16 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * mdt.c - meta data file for NILFS
+ * Meta data file for NILFS
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -434,7 +434,8 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
static const struct address_space_operations def_mdt_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.writepage = nilfs_mdt_write_page,
};
@@ -470,9 +471,18 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
void nilfs_mdt_clear(struct inode *inode)
{
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+ struct nilfs_shadow_map *shadow = mdi->mi_shadow;
if (mdi->mi_palloc_cache)
nilfs_palloc_destroy_cache(inode);
+
+ if (shadow) {
+ struct inode *s_inode = shadow->inode;
+
+ shadow->inode = NULL;
+ iput(s_inode);
+ mdi->mi_shadow = NULL;
+ }
}
/**
@@ -506,12 +516,15 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
struct nilfs_shadow_map *shadow)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+ struct inode *s_inode;
INIT_LIST_HEAD(&shadow->frozen_buffers);
- address_space_init_once(&shadow->frozen_data);
- nilfs_mapping_init(&shadow->frozen_data, inode);
- address_space_init_once(&shadow->frozen_btnodes);
- nilfs_mapping_init(&shadow->frozen_btnodes, inode);
+
+ s_inode = nilfs_iget_for_shadow(inode);
+ if (IS_ERR(s_inode))
+ return PTR_ERR(s_inode);
+
+ shadow->inode = s_inode;
mi->mi_shadow = shadow;
return 0;
}
@@ -525,14 +538,15 @@ int nilfs_mdt_save_to_shadow_map(struct inode *inode)
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
struct nilfs_inode_info *ii = NILFS_I(inode);
struct nilfs_shadow_map *shadow = mi->mi_shadow;
+ struct inode *s_inode = shadow->inode;
int ret;
- ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
+ ret = nilfs_copy_dirty_pages(s_inode->i_mapping, inode->i_mapping);
if (ret)
goto out;
- ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
- &ii->i_btnode_cache);
+ ret = nilfs_copy_dirty_pages(NILFS_I(s_inode)->i_assoc_inode->i_mapping,
+ ii->i_assoc_inode->i_mapping);
if (ret)
goto out;
@@ -548,7 +562,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
struct page *page;
int blkbits = inode->i_blkbits;
- page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
+ page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index);
if (!page)
return -ENOMEM;
@@ -580,7 +594,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
struct page *page;
int n;
- page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
+ page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index);
if (page) {
if (page_has_buffers(page)) {
n = bh_offset(bh) >> inode->i_blkbits;
@@ -621,10 +635,11 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
nilfs_palloc_clear_cache(inode);
nilfs_clear_dirty_pages(inode->i_mapping, true);
- nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
+ nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping);
- nilfs_clear_dirty_pages(&ii->i_btnode_cache, true);
- nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
+ nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true);
+ nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping,
+ NILFS_I(shadow->inode)->i_assoc_inode->i_mapping);
nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
@@ -639,10 +654,11 @@ void nilfs_mdt_clear_shadow_map(struct inode *inode)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
struct nilfs_shadow_map *shadow = mi->mi_shadow;
+ struct inode *shadow_btnc_inode = NILFS_I(shadow->inode)->i_assoc_inode;
down_write(&mi->mi_sem);
nilfs_release_frozen_buffers(shadow);
- truncate_inode_pages(&shadow->frozen_data, 0);
- truncate_inode_pages(&shadow->frozen_btnodes, 0);
+ truncate_inode_pages(shadow->inode->i_mapping, 0);
+ truncate_inode_pages(shadow_btnc_inode->i_mapping, 0);
up_write(&mi->mi_sem);
}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index e77aea4bb921..9e23bab3ff12 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * mdt.h - NILFS meta data file prototype and definitions
+ * NILFS meta data file prototype and definitions
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -18,14 +18,12 @@
/**
* struct nilfs_shadow_map - shadow mapping of meta data file
* @bmap_store: shadow copy of bmap state
- * @frozen_data: shadowed dirty data pages
- * @frozen_btnodes: shadowed dirty b-tree nodes' pages
+ * @inode: holder of page caches used in shadow mapping
* @frozen_buffers: list of frozen buffers
*/
struct nilfs_shadow_map {
struct nilfs_bmap_store bmap_store;
- struct address_space frozen_data;
- struct address_space frozen_btnodes;
+ struct inode *inode;
struct list_head frozen_buffers;
};
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 91eebeb0c48b..23899e0ae850 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * namei.c - NILFS pathname lookup operations.
+ * NILFS pathname lookup operations.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 60b21b6eeac0..1344f7d475d3 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * nilfs.h - NILFS local header file.
+ * NILFS local header file.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -28,7 +28,7 @@
* @i_xattr: <TODO>
* @i_dir_start_lookup: page index of last successful search
* @i_cno: checkpoint number for GC inode
- * @i_btnode_cache: cached pages of b-tree nodes
+ * @i_assoc_inode: associated inode (B-tree node cache holder or back pointer)
* @i_dirty: list for connecting dirty files
* @xattr_sem: semaphore for extended attributes processing
* @i_bh: buffer contains disk inode
@@ -43,7 +43,7 @@ struct nilfs_inode_info {
__u64 i_xattr; /* sector_t ??? */
__u32 i_dir_start_lookup;
__u64 i_cno; /* check point number for GC inode */
- struct address_space i_btnode_cache;
+ struct inode *i_assoc_inode;
struct list_head i_dirty; /* List for connecting dirty files */
#ifdef CONFIG_NILFS_XATTR
@@ -75,13 +75,6 @@ NILFS_BMAP_I(const struct nilfs_bmap *bmap)
return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
}
-static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
-{
- struct nilfs_inode_info *ii =
- container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
- return &ii->vfs_inode;
-}
-
/*
* Dynamic state flags of NILFS on-memory inode (i_state)
*/
@@ -98,6 +91,8 @@ enum {
NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */
NILFS_I_BMAP, /* has bmap and btnode_cache */
NILFS_I_GCINODE, /* inode for GC, on memory only */
+ NILFS_I_BTNC, /* inode for btree node cache */
+ NILFS_I_SHADOW, /* inode for shadowed page cache */
};
/*
@@ -267,6 +262,9 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
unsigned long ino);
extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
unsigned long ino, __u64 cno);
+int nilfs_attach_btree_node_cache(struct inode *inode);
+void nilfs_detach_btree_node_cache(struct inode *inode);
+struct inode *nilfs_iget_for_shadow(struct inode *inode);
extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
extern void nilfs_truncate(struct inode *);
extern void nilfs_evict_inode(struct inode *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 171fb5cd427f..a8e88cc38e16 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * page.c - buffer/page management specific to NILFS
+ * Buffer/page management specific to NILFS
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -195,12 +195,12 @@ void nilfs_page_bug(struct page *page)
*/
static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
{
- struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
+ struct buffer_head *dbh, *dbufs, *sbh;
unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
BUG_ON(PageWriteback(dst));
- sbh = sbufs = page_buffers(src);
+ sbh = page_buffers(src);
if (!page_has_buffers(dst))
create_empty_buffers(dst, sbh->b_size, 0);
@@ -436,22 +436,12 @@ unsigned int nilfs_page_count_clean_buffers(struct page *page,
return nc;
}
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
-{
- mapping->host = inode;
- mapping->flags = 0;
- mapping_set_gfp_mask(mapping, GFP_NOFS);
- mapping->private_data = NULL;
- mapping->a_ops = &empty_aops;
-}
-
/*
* NILFS2 needs clear_page_dirty() in the following two cases:
*
- * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
- * page dirty flags when it copies back pages from the shadow cache
- * (gcdat->{i_mapping,i_btnode_cache}) to its original cache
- * (dat->{i_mapping,i_btnode_cache}).
+ * 1) For B-tree node pages and data pages of DAT file, NILFS2 clears dirty
+ * flag of pages when it copies back pages from shadow cache to the
+ * original cache.
*
* 2) Some B-tree operations like insertion or deletion may dispose buffers
* in dirty state, and this needs to cancel the dirty state of their pages.
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 62b9bb469e92..21ddcdd4d63e 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * page.h - buffer/page management specific to NILFS
+ * Buffer/page management specific to NILFS
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -43,7 +43,6 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
void nilfs_copy_back_pages(struct address_space *, struct address_space *);
void nilfs_clear_dirty_page(struct page *, bool);
void nilfs_clear_dirty_pages(struct address_space *, bool);
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
unsigned int);
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 2217f904a7cf..9e2ed76c0f25 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * recovery.c - NILFS recovery logic
+ * NILFS recovery logic
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 56872e93823d..1362ccb64ec7 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * segbuf.c - NILFS segment buffer
+ * NILFS segment buffer
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -337,26 +337,12 @@ static void nilfs_end_bio_write(struct bio *bio)
}
static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
- struct nilfs_write_info *wi, int mode,
- int mode_flags)
+ struct nilfs_write_info *wi)
{
struct bio *bio = wi->bio;
- int err;
-
- if (segbuf->sb_nbio > 0 &&
- bdi_write_congested(segbuf->sb_super->s_bdi)) {
- wait_for_completion(&segbuf->sb_bio_event);
- segbuf->sb_nbio--;
- if (unlikely(atomic_read(&segbuf->sb_err))) {
- bio_put(bio);
- err = -EIO;
- goto failed;
- }
- }
bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;
- bio_set_op_attrs(bio, mode, mode_flags);
submit_bio(bio);
segbuf->sb_nbio++;
@@ -365,33 +351,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
wi->start = wi->end;
return 0;
-
- failed:
- wi->bio = NULL;
- return err;
-}
-
-/**
- * nilfs_alloc_seg_bio - allocate a new bio for writing log
- * @nilfs: nilfs object
- * @start: start block number of the bio
- * @nr_vecs: request size of page vector.
- *
- * Return Value: On success, pointer to the struct bio is returned.
- * On error, NULL is returned.
- */
-static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
- int nr_vecs)
-{
- struct bio *bio;
-
- bio = bio_alloc(GFP_NOIO, nr_vecs);
- if (likely(bio)) {
- bio_set_dev(bio, nilfs->ns_bdev);
- bio->bi_iter.bi_sector =
- start << (nilfs->ns_blocksize_bits - 9);
- }
- return bio;
}
static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
@@ -407,17 +366,17 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
struct nilfs_write_info *wi,
- struct buffer_head *bh, int mode)
+ struct buffer_head *bh)
{
int len, err;
BUG_ON(wi->nr_vecs <= 0);
repeat:
if (!wi->bio) {
- wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
- wi->nr_vecs);
- if (unlikely(!wi->bio))
- return -ENOMEM;
+ wi->bio = bio_alloc(wi->nilfs->ns_bdev, wi->nr_vecs,
+ REQ_OP_WRITE, GFP_NOIO);
+ wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) <<
+ (wi->nilfs->ns_blocksize_bits - 9);
}
len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
@@ -426,7 +385,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
return 0;
}
/* bio is FULL */
- err = nilfs_segbuf_submit_bio(segbuf, wi, mode, 0);
+ err = nilfs_segbuf_submit_bio(segbuf, wi);
/* never submit current bh */
if (likely(!err))
goto repeat;
@@ -456,13 +415,13 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
nilfs_segbuf_prepare_write(segbuf, &wi);
list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
- res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE);
+ res = nilfs_segbuf_submit_bh(segbuf, &wi, bh);
if (unlikely(res))
goto failed_bio;
}
list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
- res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE);
+ res = nilfs_segbuf_submit_bh(segbuf, &wi, bh);
if (unlikely(res))
goto failed_bio;
}
@@ -472,8 +431,8 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
* Last BIO is always sent through the following
* submission.
*/
- res = nilfs_segbuf_submit_bio(segbuf, &wi, REQ_OP_WRITE,
- REQ_SYNC);
+ wi.bio->bi_opf |= REQ_SYNC;
+ res = nilfs_segbuf_submit_bio(segbuf, &wi);
}
failed_bio:
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 9bea1bd59041..e20091ededba 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * segbuf.h - NILFS Segment buffer prototypes and definitions
+ * NILFS Segment buffer prototypes and definitions
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 686c8ee7b29c..0afe0832c754 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * segment.c - NILFS segment constructor.
+ * NILFS segment constructor.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -733,15 +733,18 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
struct list_head *listp)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
- struct address_space *mapping = &ii->i_btnode_cache;
+ struct inode *btnc_inode = ii->i_assoc_inode;
struct pagevec pvec;
struct buffer_head *bh, *head;
unsigned int i;
pgoff_t index = 0;
+ if (!btnc_inode)
+ return;
+
pagevec_init(&pvec);
- while (pagevec_lookup_tag(&pvec, mapping, &index,
+ while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index,
PAGECACHE_TAG_DIRTY)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
bh = head = page_buffers(pvec.pages[i]);
@@ -2410,7 +2413,7 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
continue;
list_del_init(&ii->i_dirty);
truncate_inode_pages(&ii->vfs_inode.i_data, 0);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+ nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping);
iput(&ii->vfs_inode);
}
}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index f5cf5308f3fc..1060f72ebf5a 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * segment.h - NILFS Segment constructor prototypes and definitions
+ * NILFS Segment constructor prototypes and definitions
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 63722475e17e..e385cca2004a 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * sufile.c - NILFS segment usage file.
+ * NILFS segment usage file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index c4e2c7a7add1..8e8a1a5a0402 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * sufile.h - NILFS segment usage file.
+ * NILFS segment usage file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f6b2d280aab5..ba108f915391 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * super.c - NILFS module and super block management.
+ * NILFS module and super block management.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -151,13 +151,14 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
{
struct nilfs_inode_info *ii;
- ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
+ ii = alloc_inode_sb(sb, nilfs_inode_cachep, GFP_NOFS);
if (!ii)
return NULL;
ii->i_bh = NULL;
ii->i_state = 0;
ii->i_cno = 0;
- nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
+ ii->i_assoc_inode = NULL;
+ ii->i_bmap = &ii->i_bmap_data;
return &ii->vfs_inode;
}
@@ -403,7 +404,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
int ret;
ret = -ERANGE;
- devsize = i_size_read(sb->s_bdev->bd_inode);
+ devsize = bdev_nr_bytes(sb->s_bdev);
if (newsize > devsize)
goto out;
@@ -1377,8 +1378,6 @@ static void nilfs_inode_init_once(void *obj)
#ifdef CONFIG_NILFS_XATTR
init_rwsem(&ii->xattr_sem);
#endif
- address_space_init_once(&ii->i_btnode_cache);
- ii->i_bmap = &ii->i_bmap_data;
inode_init_once(&ii->vfs_inode);
}
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 62f8a7ac19c8..379d22e28ed6 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * sysfs.c - sysfs support implementation.
+ * Sysfs support implementation.
*
* Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
* Copyright (C) 2014 HGST, Inc., a Western Digital Company.
@@ -57,7 +57,7 @@ static void nilfs_##name##_attr_release(struct kobject *kobj) \
complete(&subgroups->sg_##name##_kobj_unregister); \
} \
static struct kobj_type nilfs_##name##_ktype = { \
- .default_attrs = nilfs_##name##_attrs, \
+ .default_groups = nilfs_##name##_groups, \
.sysfs_ops = &nilfs_##name##_attr_ops, \
.release = nilfs_##name##_attr_release, \
}
@@ -95,7 +95,7 @@ static ssize_t
nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr,
struct nilfs_root *root, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)atomic64_read(&root->inodes_count));
}
@@ -103,7 +103,7 @@ static ssize_t
nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr,
struct nilfs_root *root, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)atomic64_read(&root->blocks_count));
}
@@ -116,7 +116,7 @@ static ssize_t
nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr,
struct nilfs_root *root, char *buf)
{
- return snprintf(buf, PAGE_SIZE, snapshot_readme_str);
+ return sysfs_emit(buf, snapshot_readme_str);
}
NILFS_SNAPSHOT_RO_ATTR(inodes_count);
@@ -129,6 +129,7 @@ static struct attribute *nilfs_snapshot_attrs[] = {
NILFS_SNAPSHOT_ATTR_LIST(README),
NULL,
};
+ATTRIBUTE_GROUPS(nilfs_snapshot);
static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
@@ -166,7 +167,7 @@ static const struct sysfs_ops nilfs_snapshot_attr_ops = {
};
static struct kobj_type nilfs_snapshot_ktype = {
- .default_attrs = nilfs_snapshot_attrs,
+ .default_groups = nilfs_snapshot_groups,
.sysfs_ops = &nilfs_snapshot_attr_ops,
.release = nilfs_snapshot_attr_release,
};
@@ -217,7 +218,7 @@ static ssize_t
nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str);
+ return sysfs_emit(buf, mounted_snapshots_readme_str);
}
NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README);
@@ -226,6 +227,7 @@ static struct attribute *nilfs_mounted_snapshots_attrs[] = {
NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README),
NULL,
};
+ATTRIBUTE_GROUPS(nilfs_mounted_snapshots);
NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev);
NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev);
@@ -255,7 +257,7 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
ncheckpoints = cpstat.cs_ncps;
- return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints);
+ return sysfs_emit(buf, "%llu\n", ncheckpoints);
}
static ssize_t
@@ -278,7 +280,7 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
nsnapshots = cpstat.cs_nsss;
- return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots);
+ return sysfs_emit(buf, "%llu\n", nsnapshots);
}
static ssize_t
@@ -292,7 +294,7 @@ nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr,
last_cno = nilfs->ns_last_cno;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+ return sysfs_emit(buf, "%llu\n", last_cno);
}
static ssize_t
@@ -306,7 +308,7 @@ nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
cno = nilfs->ns_cno;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+ return sysfs_emit(buf, "%llu\n", cno);
}
static const char checkpoints_readme_str[] =
@@ -322,7 +324,7 @@ static ssize_t
nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, checkpoints_readme_str);
+ return sysfs_emit(buf, checkpoints_readme_str);
}
NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number);
@@ -339,6 +341,7 @@ static struct attribute *nilfs_checkpoints_attrs[] = {
NILFS_CHECKPOINTS_ATTR_LIST(README),
NULL,
};
+ATTRIBUTE_GROUPS(nilfs_checkpoints);
NILFS_DEV_INT_GROUP_OPS(checkpoints, dev);
NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev);
@@ -353,7 +356,7 @@ nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments);
+ return sysfs_emit(buf, "%lu\n", nilfs->ns_nsegments);
}
static ssize_t
@@ -361,7 +364,7 @@ nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment);
+ return sysfs_emit(buf, "%lu\n", nilfs->ns_blocks_per_segment);
}
static ssize_t
@@ -375,7 +378,7 @@ nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr,
ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
- return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs);
+ return sysfs_emit(buf, "%lu\n", ncleansegs);
}
static ssize_t
@@ -395,7 +398,7 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
return err;
}
- return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs);
+ return sysfs_emit(buf, "%llu\n", sustat.ss_ndirtysegs);
}
static const char segments_readme_str[] =
@@ -411,7 +414,7 @@ nilfs_segments_README_show(struct nilfs_segments_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, segments_readme_str);
+ return sysfs_emit(buf, segments_readme_str);
}
NILFS_SEGMENTS_RO_ATTR(segments_number);
@@ -428,6 +431,7 @@ static struct attribute *nilfs_segments_attrs[] = {
NILFS_SEGMENTS_ATTR_LIST(README),
NULL,
};
+ATTRIBUTE_GROUPS(nilfs_segments);
NILFS_DEV_INT_GROUP_OPS(segments, dev);
NILFS_DEV_INT_GROUP_TYPE(segments, dev);
@@ -448,7 +452,7 @@ nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr,
last_pseg = nilfs->ns_last_pseg;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)last_pseg);
}
@@ -463,7 +467,7 @@ nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr,
last_seq = nilfs->ns_last_seq;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq);
+ return sysfs_emit(buf, "%llu\n", last_seq);
}
static ssize_t
@@ -477,7 +481,7 @@ nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr,
last_cno = nilfs->ns_last_cno;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+ return sysfs_emit(buf, "%llu\n", last_cno);
}
static ssize_t
@@ -491,7 +495,7 @@ nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
seg_seq = nilfs->ns_seg_seq;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
+ return sysfs_emit(buf, "%llu\n", seg_seq);
}
static ssize_t
@@ -505,7 +509,7 @@ nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
segnum = nilfs->ns_segnum;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
+ return sysfs_emit(buf, "%llu\n", segnum);
}
static ssize_t
@@ -519,7 +523,7 @@ nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
nextnum = nilfs->ns_nextnum;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
+ return sysfs_emit(buf, "%llu\n", nextnum);
}
static ssize_t
@@ -533,7 +537,7 @@ nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
pseg_offset = nilfs->ns_pseg_offset;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
+ return sysfs_emit(buf, "%lu\n", pseg_offset);
}
static ssize_t
@@ -547,7 +551,7 @@ nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
cno = nilfs->ns_cno;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+ return sysfs_emit(buf, "%llu\n", cno);
}
static ssize_t
@@ -575,7 +579,7 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
ctime = nilfs->ns_ctime;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", ctime);
+ return sysfs_emit(buf, "%llu\n", ctime);
}
static ssize_t
@@ -603,7 +607,7 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
nongc_ctime = nilfs->ns_nongc_ctime;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", nongc_ctime);
+ return sysfs_emit(buf, "%llu\n", nongc_ctime);
}
static ssize_t
@@ -617,7 +621,7 @@ nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
+ return sysfs_emit(buf, "%u\n", ndirtyblks);
}
static const char segctor_readme_str[] =
@@ -654,7 +658,7 @@ static ssize_t
nilfs_segctor_README_show(struct nilfs_segctor_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, segctor_readme_str);
+ return sysfs_emit(buf, segctor_readme_str);
}
NILFS_SEGCTOR_RO_ATTR(last_pseg_block);
@@ -689,6 +693,7 @@ static struct attribute *nilfs_segctor_attrs[] = {
NILFS_SEGCTOR_ATTR_LIST(README),
NULL,
};
+ATTRIBUTE_GROUPS(nilfs_segctor);
NILFS_DEV_INT_GROUP_OPS(segctor, dev);
NILFS_DEV_INT_GROUP_TYPE(segctor, dev);
@@ -723,7 +728,7 @@ nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr,
sbwtime = nilfs->ns_sbwtime;
up_read(&nilfs->ns_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", sbwtime);
+ return sysfs_emit(buf, "%llu\n", sbwtime);
}
static ssize_t
@@ -737,7 +742,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
sbwcount = nilfs->ns_sbwcount;
up_read(&nilfs->ns_sem);
- return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount);
+ return sysfs_emit(buf, "%u\n", sbwcount);
}
static ssize_t
@@ -751,7 +756,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
sb_update_freq = nilfs->ns_sb_update_freq;
up_read(&nilfs->ns_sem);
- return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq);
+ return sysfs_emit(buf, "%u\n", sb_update_freq);
}
static ssize_t
@@ -799,7 +804,7 @@ static ssize_t
nilfs_superblock_README_show(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, sb_readme_str);
+ return sysfs_emit(buf, sb_readme_str);
}
NILFS_SUPERBLOCK_RO_ATTR(sb_write_time);
@@ -816,6 +821,7 @@ static struct attribute *nilfs_superblock_attrs[] = {
NILFS_SUPERBLOCK_ATTR_LIST(README),
NULL,
};
+ATTRIBUTE_GROUPS(nilfs_superblock);
NILFS_DEV_INT_GROUP_OPS(superblock, dev);
NILFS_DEV_INT_GROUP_TYPE(superblock, dev);
@@ -834,7 +840,7 @@ ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr,
u32 major = le32_to_cpu(sbp[0]->s_rev_level);
u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level);
- return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor);
+ return sysfs_emit(buf, "%d.%d\n", major, minor);
}
static
@@ -842,7 +848,7 @@ ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize);
+ return sysfs_emit(buf, "%u\n", nilfs->ns_blocksize);
}
static
@@ -853,7 +859,7 @@ ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr,
struct nilfs_super_block **sbp = nilfs->ns_sbp;
u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size);
- return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size);
+ return sysfs_emit(buf, "%llu\n", dev_size);
}
static
@@ -864,7 +870,7 @@ ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr,
sector_t free_blocks = 0;
nilfs_count_free_blocks(nilfs, &free_blocks);
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)free_blocks);
}
@@ -875,7 +881,7 @@ ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr,
{
struct nilfs_super_block **sbp = nilfs->ns_sbp;
- return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid);
+ return sysfs_emit(buf, "%pUb\n", sbp[0]->s_uuid);
}
static
@@ -903,7 +909,7 @@ static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, dev_readme_str);
+ return sysfs_emit(buf, dev_readme_str);
}
NILFS_DEV_RO_ATTR(revision);
@@ -924,6 +930,7 @@ static struct attribute *nilfs_dev_attrs[] = {
NILFS_DEV_ATTR_LIST(README),
NULL,
};
+ATTRIBUTE_GROUPS(nilfs_dev);
static ssize_t nilfs_dev_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
@@ -961,7 +968,7 @@ static const struct sysfs_ops nilfs_dev_attr_ops = {
};
static struct kobj_type nilfs_dev_ktype = {
- .default_attrs = nilfs_dev_attrs,
+ .default_groups = nilfs_dev_groups,
.sysfs_ops = &nilfs_dev_attr_ops,
.release = nilfs_dev_attr_release,
};
@@ -1047,7 +1054,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d.%d\n",
+ return sysfs_emit(buf, "%d.%d\n",
NILFS_CURRENT_REV, NILFS_MINOR_REV);
}
@@ -1060,7 +1067,7 @@ static ssize_t nilfs_feature_README_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, features_readme_str);
+ return sysfs_emit(buf, features_readme_str);
}
NILFS_FEATURE_RO_ATTR(revision);
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index d001eb862dae..78a87a016928 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * sysfs.h - sysfs support declarations.
+ * Sysfs support declarations.
*
* Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
* Copyright (C) 2014 HGST, Inc., a Western Digital Company.
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index c8bfc01da5d7..dd48a8f74d57 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * the_nilfs.c - the_nilfs shared structure.
+ * the_nilfs shared structure.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -489,7 +489,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
{
struct nilfs_super_block **sbp = nilfs->ns_sbp;
struct buffer_head **sbh = nilfs->ns_sbh;
- u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
+ u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev));
int valid[2], swp = 0;
sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 987c8ab02aee..47c7dfbb7ea5 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * the_nilfs.h - the_nilfs shared structure.
+ * the_nilfs shared structure.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index e85e13c50d6d..829dd4a61b66 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -19,7 +19,25 @@
#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
-int dir_notify_enable __read_mostly = 1;
+static int dir_notify_enable __read_mostly = 1;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table dnotify_sysctls[] = {
+ {
+ .procname = "dir-notify-enable",
+ .data = &dir_notify_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+static void __init dnotify_sysctl_init(void)
+{
+ register_sysctl_init("fs", dnotify_sysctls);
+}
+#else
+#define dnotify_sysctl_init() do { } while (0)
+#endif
static struct kmem_cache *dnotify_struct_cache __read_mostly;
static struct kmem_cache *dnotify_mark_cache __read_mostly;
@@ -196,7 +214,7 @@ static __u32 convert_arg(unsigned long arg)
if (arg & DN_ATTRIB)
new_mask |= FS_ATTRIB;
if (arg & DN_RENAME)
- new_mask |= FS_DN_RENAME;
+ new_mask |= FS_RENAME;
if (arg & DN_CREATE)
new_mask |= (FS_CREATE | FS_MOVED_TO);
@@ -386,6 +404,7 @@ static int __init dnotify_init(void)
dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
if (IS_ERR(dnotify_group))
panic("unable to allocate fsnotify group for dnotify\n");
+ dnotify_sysctl_init();
return 0;
}
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 057abd2cf887..985e995d2a39 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -76,8 +76,10 @@ static bool fanotify_info_equal(struct fanotify_info *info1,
struct fanotify_info *info2)
{
if (info1->dir_fh_totlen != info2->dir_fh_totlen ||
+ info1->dir2_fh_totlen != info2->dir2_fh_totlen ||
info1->file_fh_totlen != info2->file_fh_totlen ||
- info1->name_len != info2->name_len)
+ info1->name_len != info2->name_len ||
+ info1->name2_len != info2->name2_len)
return false;
if (info1->dir_fh_totlen &&
@@ -85,14 +87,24 @@ static bool fanotify_info_equal(struct fanotify_info *info1,
fanotify_info_dir_fh(info2)))
return false;
+ if (info1->dir2_fh_totlen &&
+ !fanotify_fh_equal(fanotify_info_dir2_fh(info1),
+ fanotify_info_dir2_fh(info2)))
+ return false;
+
if (info1->file_fh_totlen &&
!fanotify_fh_equal(fanotify_info_file_fh(info1),
fanotify_info_file_fh(info2)))
return false;
- return !info1->name_len ||
- !memcmp(fanotify_info_name(info1), fanotify_info_name(info2),
- info1->name_len);
+ if (info1->name_len &&
+ memcmp(fanotify_info_name(info1), fanotify_info_name(info2),
+ info1->name_len))
+ return false;
+
+ return !info1->name2_len ||
+ !memcmp(fanotify_info_name2(info1), fanotify_info_name2(info2),
+ info1->name2_len);
}
static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
@@ -111,6 +123,16 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
return fanotify_info_equal(info1, info2);
}
+static bool fanotify_error_event_equal(struct fanotify_error_event *fee1,
+ struct fanotify_error_event *fee2)
+{
+ /* Error events against the same file system are always merged. */
+ if (!fanotify_fsid_equal(&fee1->fsid, &fee2->fsid))
+ return false;
+
+ return true;
+}
+
static bool fanotify_should_merge(struct fanotify_event *old,
struct fanotify_event *new)
{
@@ -131,6 +153,13 @@ static bool fanotify_should_merge(struct fanotify_event *old,
if ((old->mask & FS_ISDIR) != (new->mask & FS_ISDIR))
return false;
+ /*
+ * FAN_RENAME event is reported with special info record types,
+ * so we cannot merge it with other events.
+ */
+ if ((old->mask & FAN_RENAME) != (new->mask & FAN_RENAME))
+ return false;
+
switch (old->type) {
case FANOTIFY_EVENT_TYPE_PATH:
return fanotify_path_equal(fanotify_event_path(old),
@@ -141,6 +170,9 @@ static bool fanotify_should_merge(struct fanotify_event *old,
case FANOTIFY_EVENT_TYPE_FID_NAME:
return fanotify_name_event_equal(FANOTIFY_NE(old),
FANOTIFY_NE(new));
+ case FANOTIFY_EVENT_TYPE_FS_ERROR:
+ return fanotify_error_event_equal(FANOTIFY_EE(old),
+ FANOTIFY_EE(new));
default:
WARN_ON_ONCE(1);
}
@@ -176,6 +208,10 @@ static int fanotify_merge(struct fsnotify_group *group,
break;
if (fanotify_should_merge(old, new)) {
old->mask |= new->mask;
+
+ if (fanotify_is_error_event(old->mask))
+ FANOTIFY_EE(old)->err_count++;
+
return 1;
}
}
@@ -255,8 +291,9 @@ out:
*/
static u32 fanotify_group_event_mask(struct fsnotify_group *group,
struct fsnotify_iter_info *iter_info,
- u32 event_mask, const void *data,
- int data_type, struct inode *dir)
+ u32 *match_mask, u32 event_mask,
+ const void *data, int data_type,
+ struct inode *dir)
{
__u32 marks_mask = 0, marks_ignored_mask = 0;
__u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS |
@@ -282,7 +319,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
return 0;
}
- fsnotify_foreach_obj_type(type) {
+ fsnotify_foreach_iter_type(type) {
if (!fsnotify_iter_should_report_type(iter_info, type))
continue;
mark = iter_info->marks[type];
@@ -301,11 +338,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
* If the event is on a child and this mark is on a parent not
* watching children, don't send it!
*/
- if (type == FSNOTIFY_OBJ_TYPE_PARENT &&
+ if (type == FSNOTIFY_ITER_TYPE_PARENT &&
!(mark->mask & FS_EVENT_ON_CHILD))
continue;
marks_mask |= mark->mask;
+
+ /* Record the mark types of this group that matched the event */
+ *match_mask |= 1U << type;
}
test_mask = event_mask & marks_mask & ~marks_ignored_mask;
@@ -343,13 +383,23 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
static int fanotify_encode_fh_len(struct inode *inode)
{
int dwords = 0;
+ int fh_len;
if (!inode)
return 0;
exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+ fh_len = dwords << 2;
- return dwords << 2;
+ /*
+ * struct fanotify_error_event might be preallocated and is
+ * limited to MAX_HANDLE_SZ. This should never happen, but
+ * safeguard by forcing an invalid file handle.
+ */
+ if (WARN_ON_ONCE(fh_len > MAX_HANDLE_SZ))
+ return 0;
+
+ return fh_len;
}
/*
@@ -370,15 +420,21 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
fh->type = FILEID_ROOT;
fh->len = 0;
fh->flags = 0;
+
+ /*
+ * Invalid FHs are used by FAN_FS_ERROR for errors not
+ * linked to any inode. The f_handle won't be reported
+ * back to userspace.
+ */
if (!inode)
- return 0;
+ goto out;
/*
* !gpf means preallocated variable size fh, but fh_len could
* be zero in that case if encoding fh len failed.
*/
err = -ENOENT;
- if (fh_len < 4 || WARN_ON_ONCE(fh_len % 4))
+ if (fh_len < 4 || WARN_ON_ONCE(fh_len % 4) || fh_len > MAX_HANDLE_SZ)
goto out_err;
/* No external buffer in a variable size allocated fh */
@@ -403,8 +459,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
fh->type = type;
fh->len = fh_len;
- /* Mix fh into event merge key */
- *hash ^= fanotify_hash_fh(fh);
+out:
+ /*
+ * Mix fh into event merge key. Hash might be NULL in case of
+ * unhashed FID events (i.e. FAN_FS_ERROR).
+ */
+ if (hash)
+ *hash ^= fanotify_hash_fh(fh);
return FANOTIFY_FH_HDR_LEN + fh_len;
@@ -420,17 +481,41 @@ out_err:
}
/*
- * The inode to use as identifier when reporting fid depends on the event.
- * Report the modified directory inode on dirent modification events.
- * Report the "victim" inode otherwise.
+ * FAN_REPORT_FID is ambiguous in that it reports the fid of the child for
+ * some events and the fid of the parent for create/delete/move events.
+ *
+ * With the FAN_REPORT_TARGET_FID flag, the fid of the child is reported
+ * also in create/delete/move events in addition to the fid of the parent
+ * and the name of the child.
+ */
+static inline bool fanotify_report_child_fid(unsigned int fid_mode, u32 mask)
+{
+ if (mask & ALL_FSNOTIFY_DIRENT_EVENTS)
+ return (fid_mode & FAN_REPORT_TARGET_FID);
+
+ return (fid_mode & FAN_REPORT_FID) && !(mask & FAN_ONDIR);
+}
+
+/*
+ * The inode to use as identifier when reporting fid depends on the event
+ * and the group flags.
+ *
+ * With the group flag FAN_REPORT_TARGET_FID, always report the child fid.
+ *
+ * Without the group flag FAN_REPORT_TARGET_FID, report the modified directory
+ * fid on dirent events and the child fid otherwise.
+ *
* For example:
- * FS_ATTRIB reports the child inode even if reported on a watched parent.
- * FS_CREATE reports the modified dir inode and not the created inode.
+ * FS_ATTRIB reports the child fid even if reported on a watched parent.
+ * FS_CREATE reports the modified dir fid without FAN_REPORT_TARGET_FID.
+ * and reports the created child fid with FAN_REPORT_TARGET_FID.
*/
static struct inode *fanotify_fid_inode(u32 event_mask, const void *data,
- int data_type, struct inode *dir)
+ int data_type, struct inode *dir,
+ unsigned int fid_mode)
{
- if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
+ if ((event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) &&
+ !(fid_mode & FAN_REPORT_TARGET_FID))
return dir;
return fsnotify_data_inode(data, data_type);
@@ -452,7 +537,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data,
if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
return dir;
- if (S_ISDIR(inode->i_mode))
+ if (inode && S_ISDIR(inode->i_mode))
return inode;
return dir;
@@ -514,25 +599,34 @@ static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id,
return &ffe->fae;
}
-static struct fanotify_event *fanotify_alloc_name_event(struct inode *id,
+static struct fanotify_event *fanotify_alloc_name_event(struct inode *dir,
__kernel_fsid_t *fsid,
const struct qstr *name,
struct inode *child,
+ struct dentry *moved,
unsigned int *hash,
gfp_t gfp)
{
struct fanotify_name_event *fne;
struct fanotify_info *info;
struct fanotify_fh *dfh, *ffh;
- unsigned int dir_fh_len = fanotify_encode_fh_len(id);
+ struct inode *dir2 = moved ? d_inode(moved->d_parent) : NULL;
+ const struct qstr *name2 = moved ? &moved->d_name : NULL;
+ unsigned int dir_fh_len = fanotify_encode_fh_len(dir);
+ unsigned int dir2_fh_len = fanotify_encode_fh_len(dir2);
unsigned int child_fh_len = fanotify_encode_fh_len(child);
- unsigned int size;
-
- size = sizeof(*fne) + FANOTIFY_FH_HDR_LEN + dir_fh_len;
+ unsigned long name_len = name ? name->len : 0;
+ unsigned long name2_len = name2 ? name2->len : 0;
+ unsigned int len, size;
+
+ /* Reserve terminating null byte even for empty name */
+ size = sizeof(*fne) + name_len + name2_len + 2;
+ if (dir_fh_len)
+ size += FANOTIFY_FH_HDR_LEN + dir_fh_len;
+ if (dir2_fh_len)
+ size += FANOTIFY_FH_HDR_LEN + dir2_fh_len;
if (child_fh_len)
size += FANOTIFY_FH_HDR_LEN + child_fh_len;
- if (name)
- size += name->len + 1;
fne = kmalloc(size, gfp);
if (!fne)
return NULL;
@@ -542,40 +636,97 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id,
*hash ^= fanotify_hash_fsid(fsid);
info = &fne->info;
fanotify_info_init(info);
- dfh = fanotify_info_dir_fh(info);
- info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, hash, 0);
+ if (dir_fh_len) {
+ dfh = fanotify_info_dir_fh(info);
+ len = fanotify_encode_fh(dfh, dir, dir_fh_len, hash, 0);
+ fanotify_info_set_dir_fh(info, len);
+ }
+ if (dir2_fh_len) {
+ dfh = fanotify_info_dir2_fh(info);
+ len = fanotify_encode_fh(dfh, dir2, dir2_fh_len, hash, 0);
+ fanotify_info_set_dir2_fh(info, len);
+ }
if (child_fh_len) {
ffh = fanotify_info_file_fh(info);
- info->file_fh_totlen = fanotify_encode_fh(ffh, child,
- child_fh_len, hash, 0);
+ len = fanotify_encode_fh(ffh, child, child_fh_len, hash, 0);
+ fanotify_info_set_file_fh(info, len);
}
- if (name) {
- long salt = name->len;
-
+ if (name_len) {
fanotify_info_copy_name(info, name);
- *hash ^= full_name_hash((void *)salt, name->name, name->len);
+ *hash ^= full_name_hash((void *)name_len, name->name, name_len);
+ }
+ if (name2_len) {
+ fanotify_info_copy_name2(info, name2);
+ *hash ^= full_name_hash((void *)name2_len, name2->name,
+ name2_len);
}
- pr_debug("%s: ino=%lu size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n",
- __func__, id->i_ino, size, dir_fh_len, child_fh_len,
+ pr_debug("%s: size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n",
+ __func__, size, dir_fh_len, child_fh_len,
info->name_len, info->name_len, fanotify_info_name(info));
+ if (dir2_fh_len) {
+ pr_debug("%s: dir2_fh_len=%u name2_len=%u name2='%.*s'\n",
+ __func__, dir2_fh_len, info->name2_len,
+ info->name2_len, fanotify_info_name2(info));
+ }
+
return &fne->fae;
}
-static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
- u32 mask, const void *data,
- int data_type, struct inode *dir,
- const struct qstr *file_name,
- __kernel_fsid_t *fsid)
+static struct fanotify_event *fanotify_alloc_error_event(
+ struct fsnotify_group *group,
+ __kernel_fsid_t *fsid,
+ const void *data, int data_type,
+ unsigned int *hash)
+{
+ struct fs_error_report *report =
+ fsnotify_data_error_report(data, data_type);
+ struct inode *inode;
+ struct fanotify_error_event *fee;
+ int fh_len;
+
+ if (WARN_ON_ONCE(!report))
+ return NULL;
+
+ fee = mempool_alloc(&group->fanotify_data.error_events_pool, GFP_NOFS);
+ if (!fee)
+ return NULL;
+
+ fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
+ fee->error = report->error;
+ fee->err_count = 1;
+ fee->fsid = *fsid;
+
+ inode = report->inode;
+ fh_len = fanotify_encode_fh_len(inode);
+
+ /* Bad fh_len. Fallback to using an invalid fh. Should never happen. */
+ if (!fh_len && inode)
+ inode = NULL;
+
+ fanotify_encode_fh(&fee->object_fh, inode, fh_len, NULL, 0);
+
+ *hash ^= fanotify_hash_fsid(fsid);
+
+ return &fee->fae;
+}
+
+static struct fanotify_event *fanotify_alloc_event(
+ struct fsnotify_group *group,
+ u32 mask, const void *data, int data_type,
+ struct inode *dir, const struct qstr *file_name,
+ __kernel_fsid_t *fsid, u32 match_mask)
{
struct fanotify_event *event = NULL;
gfp_t gfp = GFP_KERNEL_ACCOUNT;
- struct inode *id = fanotify_fid_inode(mask, data, data_type, dir);
+ unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+ struct inode *id = fanotify_fid_inode(mask, data, data_type, dir,
+ fid_mode);
struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
const struct path *path = fsnotify_data_path(data, data_type);
- unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
struct mem_cgroup *old_memcg;
+ struct dentry *moved = NULL;
struct inode *child = NULL;
bool name_event = false;
unsigned int hash = 0;
@@ -584,11 +735,10 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) {
/*
- * With both flags FAN_REPORT_DIR_FID and FAN_REPORT_FID, we
- * report the child fid for events reported on a non-dir child
+ * For certain events and group flags, report the child fid
* in addition to reporting the parent fid and maybe child name.
*/
- if ((fid_mode & FAN_REPORT_FID) && id != dirid && !ondir)
+ if (fanotify_report_child_fid(fid_mode, mask) && id != dirid)
child = id;
id = dirid;
@@ -612,6 +762,38 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
} else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || !ondir) {
name_event = true;
}
+
+ /*
+ * In the special case of FAN_RENAME event, use the match_mask
+ * to determine if we need to report only the old parent+name,
+ * only the new parent+name or both.
+ * 'dirid' and 'file_name' are the old parent+name and
+ * 'moved' has the new parent+name.
+ */
+ if (mask & FAN_RENAME) {
+ bool report_old, report_new;
+
+ if (WARN_ON_ONCE(!match_mask))
+ return NULL;
+
+ /* Report both old and new parent+name if sb watching */
+ report_old = report_new =
+ match_mask & (1U << FSNOTIFY_ITER_TYPE_SB);
+ report_old |=
+ match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE);
+ report_new |=
+ match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE2);
+
+ if (!report_old) {
+ /* Do not report old parent+name */
+ dirid = NULL;
+ file_name = NULL;
+ }
+ if (report_new) {
+ /* Report new parent+name */
+ moved = fsnotify_data_dentry(data, data_type);
+ }
+ }
}
/*
@@ -630,9 +812,12 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
if (fanotify_is_perm_event(mask)) {
event = fanotify_alloc_perm_event(path, gfp);
- } else if (name_event && (file_name || child)) {
- event = fanotify_alloc_name_event(id, fsid, file_name, child,
- &hash, gfp);
+ } else if (fanotify_is_error_event(mask)) {
+ event = fanotify_alloc_error_event(group, fsid, data,
+ data_type, &hash);
+ } else if (name_event && (file_name || moved || child)) {
+ event = fanotify_alloc_name_event(dirid, fsid, file_name, child,
+ moved, &hash, gfp);
} else if (fid_mode) {
event = fanotify_alloc_fid_event(id, fsid, &hash, gfp);
} else {
@@ -667,7 +852,7 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
int type;
__kernel_fsid_t fsid = {};
- fsnotify_foreach_obj_type(type) {
+ fsnotify_foreach_iter_type(type) {
struct fsnotify_mark_connector *conn;
if (!fsnotify_iter_should_report_type(iter_info, type))
@@ -702,6 +887,9 @@ static void fanotify_insert_event(struct fsnotify_group *group,
assert_spin_locked(&group->notification_lock);
+ if (!fanotify_is_hashed_event(event->mask))
+ return;
+
pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
group, event, bucket);
@@ -718,6 +906,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
struct fanotify_event *event;
struct fsnotify_event *fsn_event;
__kernel_fsid_t fsid = {};
+ u32 match_mask = 0;
BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
@@ -738,15 +927,18 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
+ BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR);
+ BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21);
- mask = fanotify_group_event_mask(group, iter_info, mask, data,
- data_type, dir);
+ mask = fanotify_group_event_mask(group, iter_info, &match_mask,
+ mask, data, data_type, dir);
if (!mask)
return 0;
- pr_debug("%s: group=%p mask=%x\n", __func__, group, mask);
+ pr_debug("%s: group=%p mask=%x report_mask=%x\n", __func__,
+ group, mask, match_mask);
if (fanotify_is_perm_event(mask)) {
/*
@@ -765,7 +957,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
}
event = fanotify_alloc_event(group, mask, data, data_type, dir,
- file_name, &fsid);
+ file_name, &fsid, match_mask);
ret = -ENOMEM;
if (unlikely(!event)) {
/*
@@ -778,9 +970,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
}
fsn_event = &event->fse;
- ret = fsnotify_add_event(group, fsn_event, fanotify_merge,
- fanotify_is_hashed_event(mask) ?
- fanotify_insert_event : NULL);
+ ret = fsnotify_insert_event(group, fsn_event, fanotify_merge,
+ fanotify_insert_event);
if (ret) {
/* Permission events shouldn't be merged */
BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
@@ -805,6 +996,9 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
if (group->fanotify_data.ucounts)
dec_ucount(group->fanotify_data.ucounts,
UCOUNT_FANOTIFY_GROUPS);
+
+ if (mempool_initialized(&group->fanotify_data.error_events_pool))
+ mempool_exit(&group->fanotify_data.error_events_pool);
}
static void fanotify_free_path_event(struct fanotify_event *event)
@@ -833,7 +1027,16 @@ static void fanotify_free_name_event(struct fanotify_event *event)
kfree(FANOTIFY_NE(event));
}
-static void fanotify_free_event(struct fsnotify_event *fsn_event)
+static void fanotify_free_error_event(struct fsnotify_group *group,
+ struct fanotify_event *event)
+{
+ struct fanotify_error_event *fee = FANOTIFY_EE(event);
+
+ mempool_free(fee, &group->fanotify_data.error_events_pool);
+}
+
+static void fanotify_free_event(struct fsnotify_group *group,
+ struct fsnotify_event *fsn_event)
{
struct fanotify_event *event;
@@ -855,6 +1058,9 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
case FANOTIFY_EVENT_TYPE_OVERFLOW:
kfree(event);
break;
+ case FANOTIFY_EVENT_TYPE_FS_ERROR:
+ fanotify_free_error_event(group, event);
+ break;
default:
WARN_ON_ONCE(1);
}
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 4a5e555dc3d2..a3d5b751cac5 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -40,15 +40,45 @@ struct fanotify_fh {
struct fanotify_info {
/* size of dir_fh/file_fh including fanotify_fh hdr size */
u8 dir_fh_totlen;
+ u8 dir2_fh_totlen;
u8 file_fh_totlen;
u8 name_len;
- u8 pad;
+ u8 name2_len;
+ u8 pad[3];
unsigned char buf[];
/*
* (struct fanotify_fh) dir_fh starts at buf[0]
- * (optional) file_fh starts at buf[dir_fh_totlen]
- * name starts at buf[dir_fh_totlen + file_fh_totlen]
+ * (optional) dir2_fh starts at buf[dir_fh_totlen]
+ * (optional) file_fh starts at buf[dir_fh_totlen + dir2_fh_totlen]
+ * name starts at buf[dir_fh_totlen + dir2_fh_totlen + file_fh_totlen]
+ * ...
*/
+#define FANOTIFY_DIR_FH_SIZE(info) ((info)->dir_fh_totlen)
+#define FANOTIFY_DIR2_FH_SIZE(info) ((info)->dir2_fh_totlen)
+#define FANOTIFY_FILE_FH_SIZE(info) ((info)->file_fh_totlen)
+#define FANOTIFY_NAME_SIZE(info) ((info)->name_len + 1)
+#define FANOTIFY_NAME2_SIZE(info) ((info)->name2_len + 1)
+
+#define FANOTIFY_DIR_FH_OFFSET(info) 0
+#define FANOTIFY_DIR2_FH_OFFSET(info) \
+ (FANOTIFY_DIR_FH_OFFSET(info) + FANOTIFY_DIR_FH_SIZE(info))
+#define FANOTIFY_FILE_FH_OFFSET(info) \
+ (FANOTIFY_DIR2_FH_OFFSET(info) + FANOTIFY_DIR2_FH_SIZE(info))
+#define FANOTIFY_NAME_OFFSET(info) \
+ (FANOTIFY_FILE_FH_OFFSET(info) + FANOTIFY_FILE_FH_SIZE(info))
+#define FANOTIFY_NAME2_OFFSET(info) \
+ (FANOTIFY_NAME_OFFSET(info) + FANOTIFY_NAME_SIZE(info))
+
+#define FANOTIFY_DIR_FH_BUF(info) \
+ ((info)->buf + FANOTIFY_DIR_FH_OFFSET(info))
+#define FANOTIFY_DIR2_FH_BUF(info) \
+ ((info)->buf + FANOTIFY_DIR2_FH_OFFSET(info))
+#define FANOTIFY_FILE_FH_BUF(info) \
+ ((info)->buf + FANOTIFY_FILE_FH_OFFSET(info))
+#define FANOTIFY_NAME_BUF(info) \
+ ((info)->buf + FANOTIFY_NAME_OFFSET(info))
+#define FANOTIFY_NAME2_BUF(info) \
+ ((info)->buf + FANOTIFY_NAME2_OFFSET(info))
} __aligned(4);
static inline bool fanotify_fh_has_ext_buf(struct fanotify_fh *fh)
@@ -87,7 +117,21 @@ static inline struct fanotify_fh *fanotify_info_dir_fh(struct fanotify_info *inf
{
BUILD_BUG_ON(offsetof(struct fanotify_info, buf) % 4);
- return (struct fanotify_fh *)info->buf;
+ return (struct fanotify_fh *)FANOTIFY_DIR_FH_BUF(info);
+}
+
+static inline int fanotify_info_dir2_fh_len(struct fanotify_info *info)
+{
+ if (!info->dir2_fh_totlen ||
+ WARN_ON_ONCE(info->dir2_fh_totlen < FANOTIFY_FH_HDR_LEN))
+ return 0;
+
+ return info->dir2_fh_totlen - FANOTIFY_FH_HDR_LEN;
+}
+
+static inline struct fanotify_fh *fanotify_info_dir2_fh(struct fanotify_info *info)
+{
+ return (struct fanotify_fh *)FANOTIFY_DIR2_FH_BUF(info);
}
static inline int fanotify_info_file_fh_len(struct fanotify_info *info)
@@ -101,32 +145,90 @@ static inline int fanotify_info_file_fh_len(struct fanotify_info *info)
static inline struct fanotify_fh *fanotify_info_file_fh(struct fanotify_info *info)
{
- return (struct fanotify_fh *)(info->buf + info->dir_fh_totlen);
+ return (struct fanotify_fh *)FANOTIFY_FILE_FH_BUF(info);
+}
+
+static inline char *fanotify_info_name(struct fanotify_info *info)
+{
+ if (!info->name_len)
+ return NULL;
+
+ return FANOTIFY_NAME_BUF(info);
}
-static inline const char *fanotify_info_name(struct fanotify_info *info)
+static inline char *fanotify_info_name2(struct fanotify_info *info)
{
- return info->buf + info->dir_fh_totlen + info->file_fh_totlen;
+ if (!info->name2_len)
+ return NULL;
+
+ return FANOTIFY_NAME2_BUF(info);
}
static inline void fanotify_info_init(struct fanotify_info *info)
{
+ BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN + MAX_HANDLE_SZ > U8_MAX);
+ BUILD_BUG_ON(NAME_MAX > U8_MAX);
+
info->dir_fh_totlen = 0;
+ info->dir2_fh_totlen = 0;
info->file_fh_totlen = 0;
info->name_len = 0;
+ info->name2_len = 0;
+}
+
+/* These set/copy helpers MUST be called by order */
+static inline void fanotify_info_set_dir_fh(struct fanotify_info *info,
+ unsigned int totlen)
+{
+ if (WARN_ON_ONCE(info->dir2_fh_totlen > 0) ||
+ WARN_ON_ONCE(info->file_fh_totlen > 0) ||
+ WARN_ON_ONCE(info->name_len > 0) ||
+ WARN_ON_ONCE(info->name2_len > 0))
+ return;
+
+ info->dir_fh_totlen = totlen;
}
-static inline unsigned int fanotify_info_len(struct fanotify_info *info)
+static inline void fanotify_info_set_dir2_fh(struct fanotify_info *info,
+ unsigned int totlen)
{
- return info->dir_fh_totlen + info->file_fh_totlen + info->name_len;
+ if (WARN_ON_ONCE(info->file_fh_totlen > 0) ||
+ WARN_ON_ONCE(info->name_len > 0) ||
+ WARN_ON_ONCE(info->name2_len > 0))
+ return;
+
+ info->dir2_fh_totlen = totlen;
+}
+
+static inline void fanotify_info_set_file_fh(struct fanotify_info *info,
+ unsigned int totlen)
+{
+ if (WARN_ON_ONCE(info->name_len > 0) ||
+ WARN_ON_ONCE(info->name2_len > 0))
+ return;
+
+ info->file_fh_totlen = totlen;
}
static inline void fanotify_info_copy_name(struct fanotify_info *info,
const struct qstr *name)
{
+ if (WARN_ON_ONCE(name->len > NAME_MAX) ||
+ WARN_ON_ONCE(info->name2_len > 0))
+ return;
+
info->name_len = name->len;
- strcpy(info->buf + info->dir_fh_totlen + info->file_fh_totlen,
- name->name);
+ strcpy(fanotify_info_name(info), name->name);
+}
+
+static inline void fanotify_info_copy_name2(struct fanotify_info *info,
+ const struct qstr *name)
+{
+ if (WARN_ON_ONCE(name->len > NAME_MAX))
+ return;
+
+ info->name2_len = name->len;
+ strcpy(fanotify_info_name2(info), name->name);
}
/*
@@ -141,6 +243,7 @@ enum fanotify_event_type {
FANOTIFY_EVENT_TYPE_PATH,
FANOTIFY_EVENT_TYPE_PATH_PERM,
FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
+ FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
__FANOTIFY_EVENT_TYPE_NUM
};
@@ -170,12 +273,18 @@ static inline void fanotify_init_event(struct fanotify_event *event,
event->pid = NULL;
}
+#define FANOTIFY_INLINE_FH(name, size) \
+struct { \
+ struct fanotify_fh (name); \
+ /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
+ unsigned char _inline_fh_buf[(size)]; \
+}
+
struct fanotify_fid_event {
struct fanotify_event fae;
__kernel_fsid_t fsid;
- struct fanotify_fh object_fh;
- /* Reserve space in object_fh.buf[] - access with fanotify_fh_buf() */
- unsigned char _inline_fh_buf[FANOTIFY_INLINE_FH_LEN];
+
+ FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN);
};
static inline struct fanotify_fid_event *
@@ -196,12 +305,30 @@ FANOTIFY_NE(struct fanotify_event *event)
return container_of(event, struct fanotify_name_event, fae);
}
+struct fanotify_error_event {
+ struct fanotify_event fae;
+ s32 error; /* Error reported by the Filesystem. */
+ u32 err_count; /* Suppressed errors count */
+
+ __kernel_fsid_t fsid; /* FSID this error refers to. */
+
+ FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ);
+};
+
+static inline struct fanotify_error_event *
+FANOTIFY_EE(struct fanotify_event *event)
+{
+ return container_of(event, struct fanotify_error_event, fae);
+}
+
static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event)
{
if (event->type == FANOTIFY_EVENT_TYPE_FID)
return &FANOTIFY_FE(event)->fsid;
else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
return &FANOTIFY_NE(event)->fsid;
+ else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+ return &FANOTIFY_EE(event)->fsid;
else
return NULL;
}
@@ -213,6 +340,8 @@ static inline struct fanotify_fh *fanotify_event_object_fh(
return &FANOTIFY_FE(event)->object_fh;
else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
return fanotify_info_file_fh(&FANOTIFY_NE(event)->info);
+ else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+ return &FANOTIFY_EE(event)->object_fh;
else
return NULL;
}
@@ -244,6 +373,37 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event)
return info ? fanotify_info_dir_fh_len(info) : 0;
}
+static inline int fanotify_event_dir2_fh_len(struct fanotify_event *event)
+{
+ struct fanotify_info *info = fanotify_event_info(event);
+
+ return info ? fanotify_info_dir2_fh_len(info) : 0;
+}
+
+static inline bool fanotify_event_has_object_fh(struct fanotify_event *event)
+{
+ /* For error events, even zeroed fh are reported. */
+ if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+ return true;
+ return fanotify_event_object_fh_len(event) > 0;
+}
+
+static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event)
+{
+ return fanotify_event_dir_fh_len(event) > 0;
+}
+
+static inline bool fanotify_event_has_dir2_fh(struct fanotify_event *event)
+{
+ return fanotify_event_dir2_fh_len(event) > 0;
+}
+
+static inline bool fanotify_event_has_any_dir_fh(struct fanotify_event *event)
+{
+ return fanotify_event_has_dir_fh(event) ||
+ fanotify_event_has_dir2_fh(event);
+}
+
struct fanotify_path_event {
struct fanotify_event fae;
struct path path;
@@ -287,6 +447,11 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
return container_of(fse, struct fanotify_event, fse);
}
+static inline bool fanotify_is_error_event(u32 mask)
+{
+ return mask & FAN_FS_ERROR;
+}
+
static inline bool fanotify_event_has_path(struct fanotify_event *event)
{
return event->type == FANOTIFY_EVENT_TYPE_PATH ||
@@ -315,7 +480,8 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event)
*/
static inline bool fanotify_is_hashed_event(u32 mask)
{
- return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW);
+ return !(fanotify_is_perm_event(mask) ||
+ fsnotify_is_overflow_event(mask));
}
static inline unsigned int fanotify_event_hash_bucket(
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6facdf476255..a792e21c5309 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -30,6 +30,7 @@
#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192
#define FANOTIFY_DEFAULT_MAX_GROUPS 128
+#define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32
/*
* Legacy fanotify marks limits (8192) is per group and we introduced a tunable
@@ -58,7 +59,7 @@ static int fanotify_max_queued_events __read_mostly;
static long ft_zero = 0;
static long ft_int_max = INT_MAX;
-struct ctl_table fanotify_table[] = {
+static struct ctl_table fanotify_table[] = {
{
.procname = "max_user_groups",
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
@@ -87,6 +88,13 @@ struct ctl_table fanotify_table[] = {
},
{ }
};
+
+static void __init fanotify_sysctls_init(void)
+{
+ register_sysctl("fs/fanotify", fanotify_table);
+}
+#else
+#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
/*
@@ -114,6 +122,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
sizeof(struct fanotify_event_info_pidfd)
+#define FANOTIFY_ERROR_INFO_LEN \
+ (sizeof(struct fanotify_event_info_error))
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -126,17 +136,39 @@ static int fanotify_fid_info_len(int fh_len, int name_len)
FANOTIFY_EVENT_ALIGN);
}
-static int fanotify_event_info_len(unsigned int info_mode,
- struct fanotify_event *event)
+/* FAN_RENAME may have one or two dir+name info records */
+static int fanotify_dir_name_info_len(struct fanotify_event *event)
{
struct fanotify_info *info = fanotify_event_info(event);
int dir_fh_len = fanotify_event_dir_fh_len(event);
- int fh_len = fanotify_event_object_fh_len(event);
+ int dir2_fh_len = fanotify_event_dir2_fh_len(event);
int info_len = 0;
+
+ if (dir_fh_len)
+ info_len += fanotify_fid_info_len(dir_fh_len,
+ info->name_len);
+ if (dir2_fh_len)
+ info_len += fanotify_fid_info_len(dir2_fh_len,
+ info->name2_len);
+
+ return info_len;
+}
+
+static size_t fanotify_event_len(unsigned int info_mode,
+ struct fanotify_event *event)
+{
+ size_t event_len = FAN_EVENT_METADATA_LEN;
+ int fh_len;
int dot_len = 0;
- if (dir_fh_len) {
- info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
+ if (!info_mode)
+ return event_len;
+
+ if (fanotify_is_error_event(event->mask))
+ event_len += FANOTIFY_ERROR_INFO_LEN;
+
+ if (fanotify_event_has_any_dir_fh(event)) {
+ event_len += fanotify_dir_name_info_len(event);
} else if ((info_mode & FAN_REPORT_NAME) &&
(event->mask & FAN_ONDIR)) {
/*
@@ -147,12 +179,14 @@ static int fanotify_event_info_len(unsigned int info_mode,
}
if (info_mode & FAN_REPORT_PIDFD)
- info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+ event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
- if (fh_len)
- info_len += fanotify_fid_info_len(fh_len, dot_len);
+ if (fanotify_event_has_object_fh(event)) {
+ fh_len = fanotify_event_object_fh_len(event);
+ event_len += fanotify_fid_info_len(fh_len, dot_len);
+ }
- return info_len;
+ return event_len;
}
/*
@@ -181,7 +215,7 @@ static void fanotify_unhash_event(struct fsnotify_group *group,
static struct fanotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
- size_t event_size = FAN_EVENT_METADATA_LEN;
+ size_t event_size;
struct fanotify_event *event = NULL;
struct fsnotify_event *fsn_event;
unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
@@ -194,8 +228,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
goto out;
event = FANOTIFY_E(fsn_event);
- if (info_mode)
- event_size += fanotify_event_info_len(info_mode, event);
+ event_size = fanotify_event_len(info_mode, event);
if (event_size > count) {
event = ERR_PTR(-EINVAL);
@@ -316,6 +349,27 @@ static int process_access_response(struct fsnotify_group *group,
return -ENOENT;
}
+static size_t copy_error_info_to_user(struct fanotify_event *event,
+ char __user *buf, int count)
+{
+ struct fanotify_event_info_error info = { };
+ struct fanotify_error_event *fee = FANOTIFY_EE(event);
+
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
+ info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
+
+ if (WARN_ON(count < info.hdr.len))
+ return -EFAULT;
+
+ info.error = fee->error;
+ info.error_count = fee->err_count;
+
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ return info.hdr.len;
+}
+
static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
int info_type, const char *name,
size_t name_len,
@@ -331,9 +385,6 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
__func__, fh_len, name_len, info_len, count);
- if (!fh_len)
- return 0;
-
if (WARN_ON_ONCE(len < sizeof(info) || len > count))
return -EFAULT;
@@ -348,6 +399,8 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
return -EFAULT;
break;
case FAN_EVENT_INFO_TYPE_DFID_NAME:
+ case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
+ case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
if (WARN_ON_ONCE(!name || !name_len))
return -EFAULT;
break;
@@ -368,6 +421,11 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
handle.handle_type = fh->type;
handle.handle_bytes = fh_len;
+
+ /* Mangle handle_type for bad file_handle */
+ if (!fh_len)
+ handle.handle_type = FILEID_INVALID;
+
if (copy_to_user(buf, &handle, sizeof(handle)))
return -EFAULT;
@@ -442,11 +500,19 @@ static int copy_info_records_to_user(struct fanotify_event *event,
unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
/*
- * Event info records order is as follows: dir fid + name, child fid.
+ * Event info records order is as follows:
+ * 1. dir fid + name
+ * 2. (optional) new dir fid + new name
+ * 3. (optional) child fid
*/
- if (fanotify_event_dir_fh_len(event)) {
+ if (fanotify_event_has_dir_fh(event)) {
info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
FAN_EVENT_INFO_TYPE_DFID;
+
+ /* FAN_RENAME uses special info types */
+ if (event->mask & FAN_RENAME)
+ info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;
+
ret = copy_fid_info_to_user(fanotify_event_fsid(event),
fanotify_info_dir_fh(info),
info_type,
@@ -460,7 +526,23 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
- if (fanotify_event_object_fh_len(event)) {
+ /* New dir fid+name may be reported in addition to old dir fid+name */
+ if (fanotify_event_has_dir2_fh(event)) {
+ info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
+ ret = copy_fid_info_to_user(fanotify_event_fsid(event),
+ fanotify_info_dir2_fh(info),
+ info_type,
+ fanotify_info_name2(info),
+ info->name2_len, buf, count);
+ if (ret < 0)
+ return ret;
+
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
+ if (fanotify_event_has_object_fh(event)) {
const char *dot = NULL;
int dot_len = 0;
@@ -520,6 +602,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
+ if (fanotify_is_error_event(event->mask)) {
+ ret = copy_error_info_to_user(event, buf, count);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
return total_bytes;
}
@@ -537,8 +628,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- metadata.event_len = FAN_EVENT_METADATA_LEN +
- fanotify_event_info_len(info_mode, event);
+ metadata.event_len = fanotify_event_len(info_mode, event);
metadata.metadata_len = FAN_EVENT_METADATA_LEN;
metadata.vers = FANOTIFY_METADATA_VERSION;
metadata.reserved = 0;
@@ -611,9 +701,6 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
if (fanotify_is_perm_event(event->mask))
FANOTIFY_PERM(event)->fd = fd;
- if (f)
- fd_install(fd, f);
-
if (info_mode) {
ret = copy_info_records_to_user(event, info, info_mode, pidfd,
buf, count);
@@ -621,6 +708,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
goto out_close_fd;
}
+ if (f)
+ fd_install(fd, f);
+
return metadata.event_len;
out_close_fd:
@@ -913,17 +1003,18 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
__u32 mask, unsigned int flags,
__u32 umask, int *destroy)
{
- __u32 oldmask = 0;
+ __u32 oldmask, newmask;
/* umask bits cannot be removed by user */
mask &= ~umask;
spin_lock(&fsn_mark->lock);
+ oldmask = fsnotify_calc_mask(fsn_mark);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- oldmask = fsn_mark->mask;
fsn_mark->mask &= ~mask;
} else {
fsn_mark->ignored_mask &= ~mask;
}
+ newmask = fsnotify_calc_mask(fsn_mark);
/*
* We need to keep the mark around even if remaining mask cannot
* result in any events (e.g. mask == FAN_ONDIR) to support incremenal
@@ -933,7 +1024,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
*destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
spin_unlock(&fsn_mark->lock);
- return mask & oldmask;
+ return oldmask & ~newmask;
}
static int fanotify_remove_mark(struct fsnotify_group *group,
@@ -990,29 +1081,47 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
flags, umask);
}
+static void fanotify_mark_add_ignored_mask(struct fsnotify_mark *fsn_mark,
+ __u32 mask, unsigned int flags,
+ __u32 *removed)
+{
+ fsn_mark->ignored_mask |= mask;
+
+ /*
+ * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
+ * the removal of the FS_MODIFY bit in calculated mask if it was set
+ * because of an ignored mask that is now going to survive FS_MODIFY.
+ */
+ if ((flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
+ fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+ if (!(fsn_mark->mask & FS_MODIFY))
+ *removed = FS_MODIFY;
+ }
+}
+
static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
- __u32 mask,
- unsigned int flags)
+ __u32 mask, unsigned int flags,
+ __u32 *removed)
{
- __u32 oldmask = -1;
+ __u32 oldmask, newmask;
spin_lock(&fsn_mark->lock);
+ oldmask = fsnotify_calc_mask(fsn_mark);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- oldmask = fsn_mark->mask;
fsn_mark->mask |= mask;
} else {
- fsn_mark->ignored_mask |= mask;
- if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
- fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+ fanotify_mark_add_ignored_mask(fsn_mark, mask, flags, removed);
}
+ newmask = fsnotify_calc_mask(fsn_mark);
spin_unlock(&fsn_mark->lock);
- return mask & ~oldmask;
+ return newmask & ~oldmask;
}
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
- unsigned int type,
+ unsigned int obj_type,
__kernel_fsid_t *fsid)
{
struct ucounts *ucounts = group->fanotify_data.ucounts;
@@ -1035,7 +1144,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
}
fsnotify_init_mark(mark, group);
- ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
+ ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
if (ret) {
fsnotify_put_mark(mark);
goto out_dec_ucounts;
@@ -1049,31 +1158,54 @@ out_dec_ucounts:
return ERR_PTR(ret);
}
+static int fanotify_group_init_error_pool(struct fsnotify_group *group)
+{
+ if (mempool_initialized(&group->fanotify_data.error_events_pool))
+ return 0;
+
+ return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
+ FANOTIFY_DEFAULT_FEE_POOL_SIZE,
+ sizeof(struct fanotify_error_event));
+}
static int fanotify_add_mark(struct fsnotify_group *group,
- fsnotify_connp_t *connp, unsigned int type,
+ fsnotify_connp_t *connp, unsigned int obj_type,
__u32 mask, unsigned int flags,
__kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
- __u32 added;
+ __u32 added, removed = 0;
+ int ret = 0;
mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
+ fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid);
if (IS_ERR(fsn_mark)) {
mutex_unlock(&group->mark_mutex);
return PTR_ERR(fsn_mark);
}
}
- added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- if (added & ~fsnotify_conn_mask(fsn_mark->connector))
+
+ /*
+ * Error events are pre-allocated per group, only if strictly
+ * needed (i.e. FAN_FS_ERROR was requested).
+ */
+ if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
+ ret = fanotify_group_init_error_pool(group);
+ if (ret)
+ goto out;
+ }
+
+ added = fanotify_mark_add_to_mask(fsn_mark, mask, flags, &removed);
+ if (removed || (added & ~fsnotify_conn_mask(fsn_mark->connector)))
fsnotify_recalc_mask(fsn_mark->connector);
+
+out:
mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
- return 0;
+ return ret;
}
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
@@ -1207,6 +1339,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
return -EINVAL;
+ /*
+ * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
+ * and is used as an indication to report both dir and child fid on all
+ * dirent events.
+ */
+ if ((fid_mode & FAN_REPORT_TARGET_FID) &&
+ (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
+ return -EINVAL;
+
f_flags = O_RDWR | FMODE_NONOTIFY;
if (flags & FAN_CLOEXEC)
f_flags |= O_CLOEXEC;
@@ -1295,16 +1436,15 @@ out_destroy_group:
return fd;
}
-/* Check if filesystem can encode a unique fid */
-static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
{
__kernel_fsid_t root_fsid;
int err;
/*
- * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
+ * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
*/
- err = vfs_get_fsid(path->dentry, fsid);
+ err = vfs_get_fsid(dentry, fsid);
if (err)
return err;
@@ -1312,10 +1452,10 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
return -ENODEV;
/*
- * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+ * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
* which uses a different fsid than sb root.
*/
- err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+ err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
if (err)
return err;
@@ -1323,6 +1463,12 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
root_fsid.val[1] != fsid->val[1])
return -EXDEV;
+ return 0;
+}
+
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct dentry *dentry)
+{
/*
* We need to make sure that the file system supports at least
* encoding a file handle so user can use name_to_handle_at() to
@@ -1330,8 +1476,8 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
* objects. However, name_to_handle_at() requires that the
* filesystem also supports decoding file handles.
*/
- if (!path->dentry->d_sb->s_export_op ||
- !path->dentry->d_sb->s_export_op->fh_to_dentry)
+ if (!dentry->d_sb->s_export_op ||
+ !dentry->d_sb->s_export_op->fh_to_dentry)
return -EOPNOTSUPP;
return 0;
@@ -1447,18 +1593,30 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
group->priority == FS_PRIO_0)
goto fput_and_out;
+ if (mask & FAN_FS_ERROR &&
+ mark_type != FAN_MARK_FILESYSTEM)
+ goto fput_and_out;
+
/*
- * Events with data type inode do not carry enough information to report
- * event->fd, so we do not allow setting a mask for inode events unless
- * group supports reporting fid.
- * inode events are not supported on a mount mark, because they do not
- * carry enough information (i.e. path) to be filtered by mount point.
+ * Events that do not carry enough information to report
+ * event->fd require a group that supports reporting fid. Those
+ * events are not supported on a mount mark, because they do not
+ * carry enough information (i.e. path) to be filtered by mount
+ * point.
*/
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
- if (mask & FANOTIFY_INODE_EVENTS &&
+ if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT))
goto fput_and_out;
+ /*
+ * FAN_RENAME uses special info type records to report the old and
+ * new parent+name. Reporting only old and new parent id is less
+ * useful and was not implemented.
+ */
+ if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
+ goto fput_and_out;
+
if (flags & FAN_MARK_FLUSH) {
ret = 0;
if (mark_type == FAN_MARK_MOUNT)
@@ -1482,7 +1640,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
if (fid_mode) {
- ret = fanotify_test_fid(&path, &__fsid);
+ ret = fanotify_test_fsid(path.dentry, &__fsid);
+ if (ret)
+ goto path_put_and_out;
+
+ ret = fanotify_test_fid(path.dentry);
if (ret)
goto path_put_and_out;
@@ -1495,6 +1657,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
else
mnt = path.mnt;
+ /*
+ * FAN_RENAME is not allowed on non-dir (for now).
+ * We shouldn't have allowed setting any dirent events in mask of
+ * non-dir, but because we always allowed it, error only if group
+ * was initialized with the new flag FAN_REPORT_TARGET_FID.
+ */
+ ret = -ENOTDIR;
+ if (inode && !S_ISDIR(inode->i_mode) &&
+ ((mask & FAN_RENAME) ||
+ ((mask & FANOTIFY_DIRENT_EVENTS) &&
+ FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID))))
+ goto path_put_and_out;
+
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
if (mnt || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
@@ -1586,7 +1761,7 @@ static int __init fanotify_user_setup(void)
FANOTIFY_DEFAULT_MAX_USER_MARKS);
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
@@ -1604,6 +1779,7 @@ static int __init fanotify_user_setup(void)
init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
FANOTIFY_DEFAULT_MAX_GROUPS;
init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
+ fanotify_sysctls_init();
return 0;
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 963e6ce75b96..70a8516b78bc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -70,8 +70,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
- if (iput_inode)
- iput(iput_inode);
+ iput(iput_inode);
/* for each watch, send FS_UNMOUNT and then remove it */
fsnotify_inode(inode, FS_UNMOUNT);
@@ -85,8 +84,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
}
spin_unlock(&sb->s_inode_list_lock);
- if (iput_inode)
- iput(iput_inode);
+ iput(iput_inode);
}
void fsnotify_sb_delete(struct super_block *sb)
@@ -252,6 +250,9 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group,
if (WARN_ON_ONCE(!ops->handle_inode_event))
return 0;
+ if (WARN_ON_ONCE(!inode && !dir))
+ return 0;
+
if ((inode_mark->mask & FS_EXCL_UNLINK) &&
path && d_unlinked(path->dentry))
return 0;
@@ -276,6 +277,18 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
return 0;
+ /*
+ * For FS_RENAME, 'dir' is old dir and 'data' is new dentry.
+ * The only ->handle_inode_event() backend that supports FS_RENAME is
+ * dnotify, where it means file was renamed within same parent.
+ */
+ if (mask & FS_RENAME) {
+ struct dentry *moved = fsnotify_data_dentry(data, data_type);
+
+ if (dir != moved->d_parent->d_inode)
+ return 0;
+ }
+
if (parent_mark) {
/*
* parent_mark indicates that the parent inode is watching
@@ -327,7 +340,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
/* clear ignored on inode modification */
if (mask & FS_MODIFY) {
- fsnotify_foreach_obj_type(type) {
+ fsnotify_foreach_iter_type(type) {
if (!fsnotify_iter_should_report_type(iter_info, type))
continue;
mark = iter_info->marks[type];
@@ -337,7 +350,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
}
}
- fsnotify_foreach_obj_type(type) {
+ fsnotify_foreach_iter_type(type) {
if (!fsnotify_iter_should_report_type(iter_info, type))
continue;
mark = iter_info->marks[type];
@@ -402,7 +415,7 @@ static unsigned int fsnotify_iter_select_report_types(
int type;
/* Choose max prio group among groups of all queue heads */
- fsnotify_foreach_obj_type(type) {
+ fsnotify_foreach_iter_type(type) {
mark = iter_info->marks[type];
if (mark &&
fsnotify_compare_groups(max_prio_group, mark->group) > 0)
@@ -414,7 +427,7 @@ static unsigned int fsnotify_iter_select_report_types(
/* Set the report mask for marks from same group as max prio group */
iter_info->report_mask = 0;
- fsnotify_foreach_obj_type(type) {
+ fsnotify_foreach_iter_type(type) {
mark = iter_info->marks[type];
if (mark &&
fsnotify_compare_groups(max_prio_group, mark->group) == 0)
@@ -432,7 +445,7 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
int type;
- fsnotify_foreach_obj_type(type) {
+ fsnotify_foreach_iter_type(type) {
if (fsnotify_iter_should_report_type(iter_info, type))
iter_info->marks[type] =
fsnotify_next_mark(iter_info->marks[type]);
@@ -455,18 +468,20 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
* @file_name is relative to
* @file_name: optional file name associated with event
* @inode: optional inode associated with event -
- * either @dir or @inode must be non-NULL.
- * if both are non-NULL event may be reported to both.
+ * If @dir and @inode are both non-NULL, event may be
+ * reported to both.
* @cookie: inotify rename cookie
*/
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
const struct qstr *file_name, struct inode *inode, u32 cookie)
{
const struct path *path = fsnotify_data_path(data, data_type);
+ struct super_block *sb = fsnotify_data_sb(data, data_type);
struct fsnotify_iter_info iter_info = {};
- struct super_block *sb;
struct mount *mnt = NULL;
- struct inode *parent = NULL;
+ struct inode *inode2 = NULL;
+ struct dentry *moved;
+ int inode2_type;
int ret = 0;
__u32 test_mask, marks_mask;
@@ -476,14 +491,20 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
if (!inode) {
/* Dirent event - report on TYPE_INODE to dir */
inode = dir;
+ /* For FS_RENAME, inode is old_dir and inode2 is new_dir */
+ if (mask & FS_RENAME) {
+ moved = fsnotify_data_dentry(data, data_type);
+ inode2 = moved->d_parent->d_inode;
+ inode2_type = FSNOTIFY_ITER_TYPE_INODE2;
+ }
} else if (mask & FS_EVENT_ON_CHILD) {
/*
* Event on child - report on TYPE_PARENT to dir if it is
* watching children and on TYPE_INODE to child.
*/
- parent = dir;
+ inode2 = dir;
+ inode2_type = FSNOTIFY_ITER_TYPE_PARENT;
}
- sb = inode->i_sb;
/*
* Optimization: srcu_read_lock() has a memory barrier which can
@@ -495,7 +516,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
if (!sb->s_fsnotify_marks &&
(!mnt || !mnt->mnt_fsnotify_marks) &&
(!inode || !inode->i_fsnotify_marks) &&
- (!parent || !parent->i_fsnotify_marks))
+ (!inode2 || !inode2->i_fsnotify_marks))
return 0;
marks_mask = sb->s_fsnotify_mask;
@@ -503,33 +524,35 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
marks_mask |= mnt->mnt_fsnotify_mask;
if (inode)
marks_mask |= inode->i_fsnotify_mask;
- if (parent)
- marks_mask |= parent->i_fsnotify_mask;
+ if (inode2)
+ marks_mask |= inode2->i_fsnotify_mask;
/*
- * if this is a modify event we may need to clear the ignored masks
- * otherwise return if none of the marks care about this type of event.
+ * If this is a modify event we may need to clear some ignored masks.
+ * In that case, the object with ignored masks will have the FS_MODIFY
+ * event in its mask.
+ * Otherwise, return if none of the marks care about this type of event.
*/
test_mask = (mask & ALL_FSNOTIFY_EVENTS);
- if (!(mask & FS_MODIFY) && !(test_mask & marks_mask))
+ if (!(test_mask & marks_mask))
return 0;
iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
- iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
+ iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
fsnotify_first_mark(&sb->s_fsnotify_marks);
if (mnt) {
- iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
+ iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
}
if (inode) {
- iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
+ iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] =
fsnotify_first_mark(&inode->i_fsnotify_marks);
}
- if (parent) {
- iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] =
- fsnotify_first_mark(&parent->i_fsnotify_marks);
+ if (inode2) {
+ iter_info.marks[inode2_type] =
+ fsnotify_first_mark(&inode2->i_fsnotify_marks);
}
/*
diff --git a/fs/notify/group.c b/fs/notify/group.c
index fb89c351295d..b7d4d64f87c2 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -58,7 +58,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
fsnotify_group_stop_queueing(group);
/* Clear all marks for this group and queue them for destruction */
- fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES_MASK);
+ fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_ANY);
/*
* Some marks can still be pinned when waiting for response from
@@ -88,7 +88,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
* that deliberately ignores overflow events.
*/
if (group->overflow_event)
- group->ops->free_event(group->overflow_event);
+ group->ops->free_event(group, group->overflow_event);
fsnotify_put_group(group);
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index d1a64daa0171..d92d7b0adc9a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -116,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
if (len)
strcpy(event->name, name->name);
- ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL);
+ ret = fsnotify_add_event(group, fsn_event, inotify_merge);
if (ret) {
/* Our event wasn't used in the end. Free it. */
fsnotify_destroy_event(group, fsn_event);
@@ -177,7 +177,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
dec_inotify_instances(group->inotify_data.ucounts);
}
-static void inotify_free_event(struct fsnotify_event *fsn_event)
+static void inotify_free_event(struct fsnotify_group *group,
+ struct fsnotify_event *fsn_event)
{
kfree(INOTIFY_E(fsn_event));
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 62051247f6d2..54583f62dc44 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
static long it_zero = 0;
static long it_int_max = INT_MAX;
-struct ctl_table inotify_table[] = {
+static struct ctl_table inotify_table[] = {
{
.procname = "max_user_instances",
.data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
@@ -87,6 +87,14 @@ struct ctl_table inotify_table[] = {
},
{ }
};
+
+static void __init inotify_sysctls_init(void)
+{
+ register_sysctl("fs/inotify", inotify_table);
+}
+
+#else
+#define inotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
@@ -94,10 +102,10 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
__u32 mask;
/*
- * Everything should accept their own ignored and should receive events
- * when the inode is unmounted. All directories care about children.
+ * Everything should receive events when the inode is unmounted.
+ * All directories care about children.
*/
- mask = (FS_IN_IGNORED | FS_UNMOUNT);
+ mask = (FS_UNMOUNT);
if (S_ISDIR(inode->i_mode))
mask |= FS_EVENT_ON_CHILD;
@@ -849,6 +857,7 @@ static int __init inotify_user_setup(void)
inotify_max_queued_events = 16384;
init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max;
+ inotify_sysctls_init();
return 0;
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fa1d99101f89..4853184f7dde 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -127,7 +127,7 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
return;
hlist_for_each_entry(mark, &conn->list, obj_list) {
if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
- new_mask |= mark->mask;
+ new_mask |= fsnotify_calc_mask(mark);
}
*fsnotify_conn_mask_p(conn) = new_mask;
}
@@ -353,7 +353,7 @@ bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
{
int type;
- fsnotify_foreach_obj_type(type) {
+ fsnotify_foreach_iter_type(type) {
/* This can fail if mark is being removed */
if (!fsnotify_get_mark_safe(iter_info->marks[type])) {
__release(&fsnotify_mark_srcu);
@@ -382,7 +382,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
int type;
iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
- fsnotify_foreach_obj_type(type)
+ fsnotify_foreach_iter_type(type)
fsnotify_put_mark_wake(iter_info->marks[type]);
}
@@ -496,7 +496,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
}
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- unsigned int type,
+ unsigned int obj_type,
__kernel_fsid_t *fsid)
{
struct inode *inode = NULL;
@@ -507,7 +507,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
return -ENOMEM;
spin_lock_init(&conn->lock);
INIT_HLIST_HEAD(&conn->list);
- conn->type = type;
+ conn->type = obj_type;
conn->obj = connp;
/* Cache fsid of filesystem containing the object */
if (fsid) {
@@ -572,7 +572,8 @@ out:
* priority, highest number first, and then by the group's location in memory.
*/
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
- fsnotify_connp_t *connp, unsigned int type,
+ fsnotify_connp_t *connp,
+ unsigned int obj_type,
int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_mark *lmark, *last = NULL;
@@ -580,7 +581,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
int cmp;
int err = 0;
- if (WARN_ON(!fsnotify_valid_obj_type(type)))
+ if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
return -EINVAL;
/* Backend is expected to check for zero fsid (e.g. tmpfs) */
@@ -592,7 +593,8 @@ restart:
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
- err = fsnotify_attach_connector_to_object(connp, type, fsid);
+ err = fsnotify_attach_connector_to_object(connp, obj_type,
+ fsid);
if (err)
return err;
goto restart;
@@ -665,7 +667,7 @@ out_err:
* event types should be delivered to which group.
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
- fsnotify_connp_t *connp, unsigned int type,
+ fsnotify_connp_t *connp, unsigned int obj_type,
int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_group *group = mark->group;
@@ -686,11 +688,11 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
+ ret = fsnotify_add_mark_list(mark, connp, obj_type, allow_dups, fsid);
if (ret)
goto err;
- if (mark->mask)
+ if (mark->mask || mark->ignored_mask)
fsnotify_recalc_mask(mark->connector);
return ret;
@@ -706,13 +708,14 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
+ unsigned int obj_type, int allow_dups,
+ __kernel_fsid_t *fsid)
{
int ret;
struct fsnotify_group *group = mark->group;
mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
+ ret = fsnotify_add_mark_locked(mark, connp, obj_type, allow_dups, fsid);
mutex_unlock(&group->mark_mutex);
return ret;
}
@@ -747,14 +750,14 @@ EXPORT_SYMBOL_GPL(fsnotify_find_mark);
/* Clear any marks in a group with given type mask */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
- unsigned int type_mask)
+ unsigned int obj_type)
{
struct fsnotify_mark *lmark, *mark;
LIST_HEAD(to_free);
struct list_head *head = &to_free;
/* Skip selection step if we want to clear all marks. */
- if (type_mask == FSNOTIFY_OBJ_ALL_TYPES_MASK) {
+ if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) {
head = &group->marks_list;
goto clear;
}
@@ -769,7 +772,7 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
*/
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
- if ((1U << mark->connector->type) & type_mask)
+ if (mark->connector->type == obj_type)
list_move(&mark->g_list, &to_free);
}
mutex_unlock(&group->mark_mutex);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 32f45543b9c6..9022ae650cf8 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -64,7 +64,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
WARN_ON(!list_empty(&event->list));
spin_unlock(&group->notification_lock);
}
- group->ops->free_event(event);
+ group->ops->free_event(group, event);
}
/*
@@ -78,12 +78,12 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
* 2 if the event was not queued - either the queue of events has overflown
* or the group is shutting down.
*/
-int fsnotify_add_event(struct fsnotify_group *group,
- struct fsnotify_event *event,
- int (*merge)(struct fsnotify_group *,
- struct fsnotify_event *),
- void (*insert)(struct fsnotify_group *,
- struct fsnotify_event *))
+int fsnotify_insert_event(struct fsnotify_group *group,
+ struct fsnotify_event *event,
+ int (*merge)(struct fsnotify_group *,
+ struct fsnotify_event *),
+ void (*insert)(struct fsnotify_group *,
+ struct fsnotify_event *))
{
int ret = 0;
struct list_head *list = &group->notification_list;
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
index 1667a7e590d8..f93e69a61283 100644
--- a/fs/ntfs/Kconfig
+++ b/fs/ntfs/Kconfig
@@ -52,6 +52,7 @@ config NTFS_DEBUG
config NTFS_RW
bool "NTFS write support"
depends on NTFS_FS
+ depends on PAGE_SIZE_LESS_THAN_64KB
help
This enables the partial, but safe, write support in the NTFS driver.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index bb0a43860ad2..90e3dad8ee45 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -593,12 +593,12 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
iblock = initialized_size >> blocksize_bits;
/*
- * Be very careful. We have no exclusion from __set_page_dirty_buffers
+ * Be very careful. We have no exclusion from block_dirty_folio
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
* then we just miss that fact, and the page stays dirty.
*
- * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+ * Buffers outside i_size may be dirtied by block_dirty_folio;
* handle that here by just cleaning them.
*/
@@ -653,7 +653,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
// Update initialized size in the attribute and
// in the inode.
// Again, for each page do:
- // __set_page_dirty_buffers();
+ // block_dirty_folio();
// put_page()
// We don't need to wait on the writes.
// Update iblock.
@@ -1350,12 +1350,13 @@ retry_writepage:
/* Is the page fully outside i_size? (truncate in progress) */
if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
PAGE_SHIFT)) {
+ struct folio *folio = page_folio(page);
/*
* The page may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ block_invalidate_folio(folio, 0, folio_size(folio));
+ folio_unlock(folio);
ntfs_debug("Write outside i_size - truncated?");
return 0;
}
@@ -1653,7 +1654,7 @@ const struct address_space_operations ntfs_normal_aops = {
.readpage = ntfs_readpage,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
.bmap = ntfs_bmap,
.migratepage = buffer_migrate_page,
@@ -1668,7 +1669,7 @@ const struct address_space_operations ntfs_compressed_aops = {
.readpage = ntfs_readpage,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -1683,9 +1684,7 @@ const struct address_space_operations ntfs_mst_aops = {
.readpage = ntfs_readpage, /* Fill page with data. */
#ifdef NTFS_RW
.writepage = ntfs_writepage, /* Write dirty page to disk. */
- .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
- without touching the buffers
- belonging to the page. */
+ .dirty_folio = filemap_dirty_folio,
#endif /* NTFS_RW */
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -1747,7 +1746,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
set_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
spin_unlock(&mapping->private_lock);
- __set_page_dirty_nobuffers(page);
+ filemap_dirty_folio(mapping, page_folio(page));
if (unlikely(buffers_to_free)) {
do {
bh = buffers_to_free->b_this_page;
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index d563abc3e136..2911c04a33e0 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
*
* Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ab4f3362466d..2ae25e48a41a 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -5,6 +5,7 @@
* Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
*/
+#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
@@ -1829,7 +1830,7 @@ again:
* pages being swapped out between us bringing them into memory
* and doing the actual copying.
*/
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
status = -EFAULT;
break;
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4474adb393ca..efe0602b4e51 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -310,7 +310,7 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
ntfs_inode *ni;
ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS);
+ ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi)
}
/* Now allocate memory for the attribute list. */
ni->attr_list_size = (u32)ntfs_attr_size(a);
+ if (!ni->attr_list_size) {
+ ntfs_error(sb, "Attr_list_size is zero");
+ goto put_err_out;
+ }
ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
if (!ni->attr_list) {
ntfs_error(sb, "Not enough memory to allocate buffer "
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0d7e948cb29c..5ae8de09b271 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2772,13 +2772,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
ntfs_debug("Set device block size to %i bytes (block size bits %i).",
blocksize, sb->s_blocksize_bits);
/* Determine the size of the device in units of block_size bytes. */
- if (!i_size_read(sb->s_bdev->bd_inode)) {
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
+ if (!vol->nr_blocks) {
if (!silent)
ntfs_error(sb, "Unable to determine device size.");
goto err_out_now;
}
- vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
/* Read the boot sector and return unlocked buffer head to it. */
if (!(bh = read_ntfs_boot_sector(sb, silent))) {
if (!silent)
@@ -2816,8 +2815,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
goto err_out_now;
}
BUG_ON(blocksize != sb->s_blocksize);
- vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
ntfs_debug("Changed device block size to %i bytes (block size "
"bits %i) to match volume sector size.",
blocksize, sb->s_blocksize_bits);
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index f8360f9bfaf0..3bae76930e68 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -8,6 +8,7 @@
*/
#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/compat.h>
#include <linux/falloc.h>
@@ -995,7 +996,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
frame_vbo = pos & ~(frame_size - 1);
index = frame_vbo >> PAGE_SHIFT;
- if (unlikely(iov_iter_fault_in_readable(from, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
err = -EFAULT;
goto out;
}
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 4de9acb16968..3de5700a9b83 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -1443,17 +1443,6 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
return err;
}
-static inline struct bio *ntfs_alloc_bio(u32 nr_vecs)
-{
- struct bio *bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs);
-
- if (!bio && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs);
- }
- return bio;
-}
-
/*
* ntfs_bio_pages - Read/write pages from/to disk.
*/
@@ -1496,19 +1485,13 @@ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
lbo = ((u64)lcn << cluster_bits) + off;
len = ((u64)clen << cluster_bits) - off;
new_bio:
- new = ntfs_alloc_bio(nr_pages - page_idx);
- if (!new) {
- err = -ENOMEM;
- goto out;
- }
+ new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
}
bio = new;
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = lbo >> 9;
- bio->bi_opf = op;
while (len) {
off = vbo & (PAGE_SIZE - 1);
@@ -1599,18 +1582,12 @@ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run)
lbo = (u64)lcn << cluster_bits;
len = (u64)clen << cluster_bits;
new_bio:
- new = ntfs_alloc_bio(BIO_MAX_VECS);
- if (!new) {
- err = -ENOMEM;
- break;
- }
+ new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
}
bio = new;
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE;
bio->bi_iter.bi_sector = lbo >> 9;
for (;;) {
@@ -1626,11 +1603,10 @@ new_bio:
}
} while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen));
- if (bio) {
- if (!err)
- err = submit_bio_wait(bio);
- bio_put(bio);
- }
+ if (!err)
+ err = submit_bio_wait(bio);
+ bio_put(bio);
+
blk_finish_plug(&plug);
out:
unlock_page(fill);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 879952254071..76519c72a970 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -1050,7 +1050,7 @@ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
if (!ret && i2)
ret = writeback_inode(i2);
if (!ret)
- ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping);
+ ret = sync_blockdev_nowait(sb->s_bdev);
return ret;
}
@@ -1954,7 +1954,7 @@ const struct address_space_operations ntfs_aops = {
.write_end = ntfs_write_end,
.direct_IO = ntfs_direct_IO,
.bmap = ntfs_bmap,
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
};
const struct address_space_operations ntfs_aops_cmpr = {
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 8aaec7e0804e..fb825059d488 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -11,7 +11,6 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
-#include <linux/cleancache.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index d41d76979e12..278dcf502410 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -399,7 +399,7 @@ static struct kmem_cache *ntfs_inode_cachep;
static struct inode *ntfs_alloc_inode(struct super_block *sb)
{
- struct ntfs_inode *ni = kmem_cache_alloc(ntfs_inode_cachep, GFP_NOFS);
+ struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS);
if (!ni)
return NULL;
@@ -921,7 +921,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Parse boot. */
err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
- bdev->bd_inode->i_size);
+ bdev_nr_bytes(bdev));
if (err)
goto out;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 5d9ae17bd443..49f41074baad 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2040,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
int i, idx;
struct ocfs2_extent_list *el, *left_el, *right_el;
struct ocfs2_extent_rec *left_rec, *right_rec;
- struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+ struct buffer_head *root_bh;
/*
* Update the counts and position values within all the
@@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
@@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
data_alloc_bh, start_blk,
num_clusters);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
@@ -5979,7 +5981,7 @@ bail:
return status;
}
-/* Expects you to already be holding tl_inode->i_mutex */
+/* Expects you to already be holding tl_inode->i_rwsem */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
int status;
@@ -6921,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
}
/*
- * Zero the area past i_size but still within an allocated
- * cluster. This avoids exposing nonzero data on subsequent file
- * extends.
+ * Zero partial cluster for a hole punch or truncate. This avoids exposing
+ * nonzero data on subsequent file extends.
*
* We need to call this before i_size is updated on the inode because
* otherwise block_write_full_page() will skip writeout of pages past
- * i_size. The new_i_size parameter is passed for this reason.
+ * i_size.
*/
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
u64 range_start, u64 range_end)
@@ -6945,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
return 0;
+ /*
+ * Avoid zeroing pages fully beyond current i_size. It is pointless as
+ * underlying blocks of those pages should be already zeroed out and
+ * page writeback will skip them anyway.
+ */
+ range_end = min_t(u64, range_end, i_size_read(inode));
+ if (range_start >= range_end)
+ return 0;
+
pages = kcalloc(ocfs2_pages_per_cluster(sb),
sizeof(struct page *), GFP_NOFS);
if (pages == NULL) {
@@ -6953,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
goto out;
}
- if (range_start == range_end)
- goto out;
-
ret = ocfs2_extent_map_get_blocks(inode,
range_start >> sb->s_blocksize_bits,
&phys, NULL, &ext_flags);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 68d11c295dd3..4b9af65cb61b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1799,20 +1799,20 @@ try_again:
*/
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
- if (ret && ret != -EAGAIN) {
- mlog_errno(ret);
- goto out_quota;
- }
+ if (ret) {
+ /*
+ * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+ * the target page. In this case, we exit with no error and no target
+ * page. This will trigger the caller, page_mkwrite(), to re-try
+ * the operation.
+ */
+ if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) {
+ BUG_ON(wc->w_target_page);
+ ret = 0;
+ goto out_quota;
+ }
- /*
- * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
- * the target page. In this case, we exit with no error and no target
- * page. This will trigger the caller, page_mkwrite(), to re-try
- * the operation.
- */
- if (ret == -EAGAIN) {
- BUG_ON(wc->w_target_page);
- ret = 0;
+ mlog_errno(ret);
goto out_quota;
}
@@ -2311,7 +2311,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
down_write(&oi->ip_alloc_sem);
- /* Delete orphan before acquire i_mutex. */
+ /* Delete orphan before acquire i_rwsem. */
if (dwc->dw_orphaned) {
BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
@@ -2453,7 +2453,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
const struct address_space_operations ocfs2_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
.readpage = ocfs2_readpage,
.readahead = ocfs2_readahead,
.writepage = ocfs2_writepage,
@@ -2461,7 +2461,7 @@ const struct address_space_operations ocfs2_aops = {
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
.direct_IO = ocfs2_direct_IO,
- .invalidatepage = block_invalidatepage,
+ .invalidate_folio = block_invalidate_folio,
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f89ffcbd585f..ea0e70c0fce0 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -379,7 +379,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
/* lowest node as master node to make negotiate decision. */
- master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
+ master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES);
if (master_node == o2nm_this_node()) {
if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
@@ -518,7 +518,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(GFP_ATOMIC, 16);
+ bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -527,10 +527,8 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
/* Must put everything in 512 byte sectors for the bio... */
bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
- bio_set_dev(bio, reg->hr_bdev);
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
- bio_set_op_attrs(bio, op, op_flags);
vec_start = (cs << bits) % PAGE_SIZE;
while(cs < max_slots) {
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 810d32815593..563881ddbf00 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -120,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(KTHREAD),
};
-static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
+static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, };
+ATTRIBUTE_GROUPS(mlog_default);
static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
char *buf)
@@ -144,8 +145,8 @@ static const struct sysfs_ops mlog_attr_ops = {
};
static struct kobj_type mlog_ktype = {
- .default_attrs = mlog_attr_ptrs,
- .sysfs_ops = &mlog_attr_ops,
+ .default_groups = mlog_default_groups,
+ .sysfs_ops = &mlog_attr_ops,
};
static struct kset mlog_kset = {
@@ -157,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset)
int i = 0;
while (mlog_attrs[i].attr.mode) {
- mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
+ mlog_default_attrs[i] = &mlog_attrs[i].attr;
i++;
}
- mlog_attr_ptrs[i] = NULL;
+ mlog_default_attrs[i] = NULL;
kobject_set_name(&mlog_kset.kobj, "logmask");
mlog_kset.kobj.kset = o2cb_kset;
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 625c92521416..27fee68f860a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -689,7 +689,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
struct o2nm_node_group *ns = NULL;
struct config_group *o2hb_group = NULL, *ret = NULL;
- /* this runs under the parent dir's i_mutex; there can be only
+ /* this runs under the parent dir's i_rwsem; there can be only
* one caller in here at a time */
if (o2nm_single_cluster)
return ERR_PTR(-ENOSPC);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index bd8d534f11cb..81c3d65d68fe 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1957,7 +1957,7 @@ bail_nolock:
}
/*
- * NOTE: this should always be called with parent dir i_mutex taken.
+ * NOTE: this should always be called with parent dir i_rwsem taken.
*/
int ocfs2_find_files_on_disk(const char *name,
int namelen,
@@ -2003,7 +2003,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
* Return 0 if the name does not exist
* Return -EEXIST if the directory contains the name
*
- * Callers should have i_mutex + a cluster lock on dir
+ * Callers should have i_rwsem + a cluster lock on dir
*/
int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
@@ -3343,7 +3343,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
struct ocfs2_dir_entry *de, *last_de = NULL;
char *de_buf, *limit;
unsigned long offset = 0;
- unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+ unsigned int rec_len, new_rec_len, free_space;
/*
* This calculates how many free bytes we'd have in block zero, should
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 9f90fc9551e1..c4eccd499db8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1045,7 +1045,7 @@ static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
int status, ret = 0, i;
char *p;
- if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES)
goto bail;
qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
@@ -1217,7 +1217,7 @@ static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
struct o2nm_node *node;
int ret = 0, status, count, i;
- if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES)
goto bail;
qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9b88219febb5..227da5b1b6ab 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -861,7 +861,7 @@ lookup:
* to see if there are any nodes that still need to be
* considered. these will not appear in the mle nodemap
* but they might own this lockres. wait on them. */
- bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES);
if (bit < O2NM_MAX_NODES) {
mlog(0, "%s: res %.*s, At least one node (%d) "
"to recover before lock mastery can begin\n",
@@ -912,7 +912,7 @@ redo_request:
dlm_wait_for_recovery(dlm);
spin_lock(&dlm->spinlock);
- bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES);
if (bit < O2NM_MAX_NODES) {
mlog(0, "%s: res %.*s, At least one node (%d) "
"to recover before lock mastery can begin\n",
@@ -1079,7 +1079,7 @@ recheck:
sleep = 1;
/* have all nodes responded? */
if (voting_done && !*blocked) {
- bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES);
if (dlm->node_num <= bit) {
/* my node number is lowest.
* now tell other nodes that I am
@@ -1234,8 +1234,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
} else {
mlog(ML_ERROR, "node down! %d\n", node);
if (blocked) {
- int lowest = find_next_bit(mle->maybe_map,
- O2NM_MAX_NODES, 0);
+ int lowest = find_first_bit(mle->maybe_map,
+ O2NM_MAX_NODES);
/* act like it was never there */
clear_bit(node, mle->maybe_map);
@@ -1795,7 +1795,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
"MLE for it! (%.*s)\n", assert->node_idx,
namelen, name);
} else {
- int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
+ int bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES);
if (bit >= O2NM_MAX_NODES) {
/* not necessarily an error, though less likely.
* could be master just re-asserting. */
@@ -2521,7 +2521,7 @@ static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm,
}
if (!nonlocal) {
- node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ node_ref = find_first_bit(res->refmap, O2NM_MAX_NODES);
if (node_ref >= O2NM_MAX_NODES)
return 0;
}
@@ -3303,7 +3303,7 @@ static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
BUG_ON(mle->type != DLM_MLE_BLOCK);
spin_lock(&mle->spinlock);
- bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES);
if (bit != dead_node) {
mlog(0, "mle found, but dead node %u would not have been "
"master\n", dead_node);
@@ -3542,7 +3542,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm)
spin_lock(&dlm->master_lock);
BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
- BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+ BUG_ON((find_first_bit(dlm->domain_map, O2NM_MAX_NODES) < O2NM_MAX_NODES));
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0e7aad1b11cc..52ad342fec3e 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -451,7 +451,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
int bit;
- bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES);
if (bit >= O2NM_MAX_NODES || bit < 0)
dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
else
@@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
continue;
}
retry:
- ret = -EINVAL;
mlog(0, "attempting to send begin reco msg to %d\n",
nodenum);
ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c350bd4df770..eedf07ca23ca 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,7 +92,7 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
return 0;
/* Another node has this resource with this node as the master */
- bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(res->refmap, O2NM_MAX_NODES);
if (bit < O2NM_MAX_NODES)
return 0;
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index fa0a14f199eb..e360543ad7e7 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -280,7 +280,7 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
{
struct dlmfs_inode_private *ip;
- ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
+ ip = alloc_inode_sb(sb, dlmfs_inode_cache, GFP_NOFS);
if (!ip)
return NULL;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 54d7843c0211..01b7407a8893 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -270,7 +270,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
/*
* Don't use ocfs2_mark_inode_dirty() here as we don't always
- * have i_mutex to guard against concurrent changes to other
+ * have i_rwsem to guard against concurrent changes to other
* inode fields.
*/
inode->i_atime = current_time(inode);
@@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode,
* greater than page size, so we have to truncate them
* anyway.
*/
- unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(inode->i_mapping, new_i_size);
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ unmap_mapping_range(inode->i_mapping,
+ new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
i_size_read(inode), 1);
if (status)
@@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode,
goto bail_unlock_sem;
}
+ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
+
status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
@@ -536,15 +540,12 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
{
- int ret;
struct ocfs2_extent_tree et;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
- ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
- clusters_to_add, mark_unwritten,
- data_ac, meta_ac, reason_ret);
-
- return ret;
+ return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+ clusters_to_add, mark_unwritten,
+ data_ac, meta_ac, reason_ret);
}
static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
@@ -1064,7 +1065,7 @@ static int ocfs2_extend_file(struct inode *inode,
/*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
- * i_mutex to block other extend/truncate calls while we're
+ * i_rwsem to block other extend/truncate calls while we're
* here. We even have to hold it for sparse files because there
* might be some tail zeroing.
*/
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index de56e6231af8..1ad7106741f8 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -94,6 +94,7 @@ static struct attribute *ocfs2_filecheck_attrs[] = {
&ocfs2_filecheck_attr_set.attr,
NULL
};
+ATTRIBUTE_GROUPS(ocfs2_filecheck);
static void ocfs2_filecheck_release(struct kobject *kobj)
{
@@ -138,7 +139,7 @@ static const struct sysfs_ops ocfs2_filecheck_ops = {
};
static struct kobj_type ocfs2_ktype_filecheck = {
- .default_attrs = ocfs2_filecheck_attrs,
+ .default_groups = ocfs2_filecheck_groups,
.sysfs_ops = &ocfs2_filecheck_ops,
.release = ocfs2_filecheck_release,
};
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index bc8f32fab964..5739dc301569 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
- journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
sysfile_type);
@@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
* part of the transaction - the inode could have been reclaimed and
* now it is reread from disk.
*/
- if (journal) {
+ if (osb->journal) {
transaction_t *transaction;
tid_t tid;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ journal_t *journal = osb->journal->j_journal;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction)
@@ -713,7 +713,7 @@ bail:
/*
* Serialize with orphan dir recovery. If the process doing
* recovery on this orphan dir does an iget() with the dir
- * i_mutex held, we'll deadlock here. Instead we detect this
+ * i_rwsem held, we'll deadlock here. Instead we detect this
* and exit early - recovery will wipe this inode for us.
*/
static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4f15750aac5d..1887a2708709 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
write_unlock(&journal->j_state_lock);
}
-int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
{
int status = -1;
struct inode *inode = NULL; /* the journal inode */
journal_t *j_journal = NULL;
+ struct ocfs2_journal *journal = NULL;
struct ocfs2_dinode *di = NULL;
struct buffer_head *bh = NULL;
- struct ocfs2_super *osb;
int inode_lock = 0;
- BUG_ON(!journal);
+ /* initialize our journal structure */
+ journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
+ if (!journal) {
+ mlog(ML_ERROR, "unable to alloc journal\n");
+ status = -ENOMEM;
+ goto done;
+ }
+ osb->journal = journal;
+ journal->j_osb = osb;
- osb = journal->j_osb;
+ atomic_set(&journal->j_num_trans, 0);
+ init_rwsem(&journal->j_trans_barrier);
+ init_waitqueue_head(&journal->j_checkpointed);
+ spin_lock_init(&journal->j_lock);
+ journal->j_trans_id = 1UL;
+ INIT_LIST_HEAD(&journal->j_la_cleanups);
+ INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
+ journal->j_state = OCFS2_JOURNAL_FREE;
/* already have the inode for our journal */
inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
@@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
journal->j_state = OCFS2_JOURNAL_FREE;
-// up_write(&journal->j_trans_barrier);
done:
iput(inode);
+ kfree(journal);
+ osb->journal = NULL;
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1497,10 +1513,7 @@ bail:
if (quota_enabled)
kfree(rm_quota);
- /* no one is callint kthread_stop() for us so the kthread() api
- * requires that we call do_exit(). And it isn't exported, but
- * complete_and_exit() seems to be a minimal wrapper around it. */
- complete_and_exit(NULL, status);
+ return status;
}
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
@@ -1656,8 +1669,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
status = jbd2_journal_load(journal);
if (status < 0) {
mlog_errno(status);
- if (!igrab(inode))
- BUG();
+ BUG_ON(!igrab(inode));
jbd2_journal_destroy(journal);
goto done;
}
@@ -1686,8 +1698,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
if (status < 0)
mlog_errno(status);
- if (!igrab(inode))
- BUG();
+ BUG_ON(!igrab(inode));
jbd2_journal_destroy(journal);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d158acb8b38a..8dcb2f2cadbc 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
*/
void ocfs2_set_journal_params(struct ocfs2_super *osb);
-int ocfs2_journal_init(struct ocfs2_journal *journal,
- int *dirty);
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty);
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
int full);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 5f6bacbeef6b..c4426d12a2ad 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -606,7 +606,7 @@ out:
/*
* make sure we've got at least bits_wanted contiguous bits in the
- * local alloc. You lose them when you drop i_mutex.
+ * local alloc. You lose them when you drop i_rwsem.
*
* We will add ourselves to the transaction passed in, but may start
* our own in order to shift windows.
@@ -636,7 +636,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
/*
* We must double check state and allocator bits because
- * another process may have changed them while holding i_mutex.
+ * another process may have changed them while holding i_rwsem.
*/
spin_lock(&osb->osb_lock);
if (!ocfs2_la_state_enabled(osb) ||
@@ -1029,7 +1029,7 @@ enum ocfs2_la_event {
/*
* Given an event, calculate the size of our next local alloc window.
*
- * This should always be called under i_mutex of the local alloc inode
+ * This should always be called under i_rwsem of the local alloc inode
* so that local alloc disabling doesn't race with processes trying to
* use the allocator.
*
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2c46ff6ba4ea..c75fd54b9185 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -476,7 +476,7 @@ leave:
ocfs2_free_alloc_context(meta_ac);
/*
- * We should call iput after the i_mutex of the bitmap been
+ * We should call iput after the i_rwsem of the bitmap been
* unlocked in ocfs2_free_alloc_context, or the
* ocfs2_delete_inode will mutex_lock again.
*/
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index bb62cc2e0211..337527571461 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -355,7 +355,7 @@ struct ocfs2_super
struct delayed_work la_enable_wq;
/*
- * Must hold local alloc i_mutex and osb->osb_lock to change
+ * Must hold local alloc i_rwsem and osb->osb_lock to change
* local_alloc_bits. Reads can be done under either lock.
*/
unsigned int local_alloc_bits;
@@ -430,7 +430,7 @@ struct ocfs2_super
atomic_t osb_tl_disable;
/*
* How many clusters in our truncate log.
- * It must be protected by osb_tl_inode->i_mutex.
+ * It must be protected by osb_tl_inode->i_rwsem.
*/
unsigned int truncated_clusters;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index f033de733adb..0b6f551a342a 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -36,7 +36,7 @@
* should be obeyed by all the functions:
* - any write of quota structure (either to local or global file) is protected
* by dqio_sem or dquot->dq_lock.
- * - any modification of global quota file holds inode cluster lock, i_mutex,
+ * - any modification of global quota file holds inode cluster lock, i_rwsem,
* and ip_alloc_sem of the global quota file (achieved by
* ocfs2_lock_global_qf). It also has to hold qinfo_lock.
* - an allocation of new blocks for local quota file is protected by
@@ -337,7 +337,6 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
/* Read information header from global quota file */
int ocfs2_global_read_info(struct super_block *sb, int type)
{
- struct inode *gqinode = NULL;
unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
GROUP_QUOTA_SYSTEM_INODE };
struct ocfs2_global_disk_dqinfo dinfo;
@@ -346,29 +345,31 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
u64 pcount;
int status;
+ oinfo->dqi_gi.dqi_sb = sb;
+ oinfo->dqi_gi.dqi_type = type;
+ ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+ oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+ oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+ oinfo->dqi_gqi_bh = NULL;
+ oinfo->dqi_gqi_count = 0;
+
/* Read global header */
- gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+ oinfo->dqi_gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
OCFS2_INVALID_SLOT);
- if (!gqinode) {
+ if (!oinfo->dqi_gqinode) {
mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
type);
status = -EINVAL;
goto out_err;
}
- oinfo->dqi_gi.dqi_sb = sb;
- oinfo->dqi_gi.dqi_type = type;
- oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
- oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
- oinfo->dqi_gqi_bh = NULL;
- oinfo->dqi_gqi_count = 0;
- oinfo->dqi_gqinode = gqinode;
+
status = ocfs2_lock_global_qf(oinfo, 0);
if (status < 0) {
mlog_errno(status);
goto out_err;
}
- status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+ status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk,
&pcount, NULL);
if (status < 0)
goto out_unlock;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 0e4b16d4c037..b1a8b046f4c2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -702,8 +702,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
info->dqi_priv = oinfo;
oinfo->dqi_type = type;
INIT_LIST_HEAD(&oinfo->dqi_chunk);
- oinfo->dqi_gqinode = NULL;
- ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_rec = NULL;
oinfo->dqi_lqi_bh = NULL;
oinfo->dqi_libh = NULL;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 85a47621e0c0..a75e2b7d67f5 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -683,28 +683,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
void *name,
unsigned int namelen)
{
- int ret;
-
if (!lksb->lksb_fsdlm.sb_lvbptr)
lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
sizeof(struct dlm_lksb);
- ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
- flags|DLM_LKF_NODLCKWT, name, namelen, 0,
- fsdlm_lock_ast_wrapper, lksb,
- fsdlm_blocking_ast_wrapper);
- return ret;
+ return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+ flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+ fsdlm_lock_ast_wrapper, lksb,
+ fsdlm_blocking_ast_wrapper);
}
static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags)
{
- int ret;
-
- ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
- flags, &lksb->lksb_fsdlm, lksb);
- return ret;
+ return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+ flags, &lksb->lksb_fsdlm, lksb);
}
static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 16f1bfc407f2..dd77b7aaabf5 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -661,42 +661,8 @@ static struct ctl_table ocfs2_nm_table[] = {
{ }
};
-static struct ctl_table ocfs2_mod_table[] = {
- {
- .procname = "nm",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_nm_table
- },
- { }
-};
-
-static struct ctl_table ocfs2_kern_table[] = {
- {
- .procname = "ocfs2",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_mod_table
- },
- { }
-};
-
-static struct ctl_table ocfs2_root_table[] = {
- {
- .procname = "fs",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_kern_table
- },
- { }
-};
-
static struct ctl_table_header *ocfs2_table_header;
-
/*
* Initialization
*/
@@ -705,7 +671,7 @@ static int __init ocfs2_stack_glue_init(void)
{
strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
- ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
+ ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table);
if (!ocfs2_table_header) {
printk(KERN_ERR
"ocfs2 stack glue: unable to register sysctl\n");
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 481017e1dac5..166c8918c825 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1251,26 +1251,23 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct journal_head *jh;
- int ret = 1;
+ int ret;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
return 0;
- if (!buffer_jbd(bg_bh))
+ jh = jbd2_journal_grab_journal_head(bg_bh);
+ if (!jh)
return 1;
- jbd_lock_bh_journal_head(bg_bh);
- if (buffer_jbd(bg_bh)) {
- jh = bh2jh(bg_bh);
- spin_lock(&jh->b_state_lock);
- bg = (struct ocfs2_group_desc *) jh->b_committed_data;
- if (bg)
- ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
- else
- ret = 1;
- spin_unlock(&jh->b_state_lock);
- }
- jbd_unlock_bh_journal_head(bg_bh);
+ spin_lock(&jh->b_state_lock);
+ bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+ if (bg)
+ ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+ else
+ ret = 1;
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
return ret;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5c914ce9b3ac..477cdf94122e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -25,7 +25,6 @@
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/quotaops.h>
-#include <linux/cleancache.h>
#include <linux/signal.h>
#define CREATE_TRACE_POINTS
@@ -549,7 +548,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
{
struct ocfs2_inode_info *oi;
- oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS);
+ oi = alloc_inode_sb(sb, ocfs2_inode_cachep, GFP_NOFS);
if (!oi)
return NULL;
@@ -1106,17 +1105,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
- root = d_make_root(inode);
- if (!root) {
- status = -ENOMEM;
- mlog_errno(status);
- goto read_super_error;
- }
-
- sb->s_root = root;
-
- ocfs2_complete_mount_recovery(osb);
-
osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
&ocfs2_kset->kobj);
if (!osb->osb_dev_kset) {
@@ -1134,6 +1122,17 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
+ root = d_make_root(inode);
+ if (!root) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto read_super_error;
+ }
+
+ sb->s_root = root;
+
+ ocfs2_complete_mount_recovery(osb);
+
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
@@ -1894,8 +1893,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
/* This will disable recovery and flush any recovery work. */
ocfs2_recovery_exit(osb);
- ocfs2_journal_shutdown(osb);
-
ocfs2_sync_blockdev(sb);
ocfs2_purge_refcount_trees(osb);
@@ -1918,6 +1915,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_release_system_inodes(osb);
+ ocfs2_journal_shutdown(osb);
+
/*
* If we're dismounting due to mount error, mount.ocfs2 will clean
* up heartbeat. If we're a local mount, there is no heartbeat.
@@ -2016,7 +2015,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
int i, cbits, bbits;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
- struct ocfs2_journal *journal;
struct ocfs2_super *osb;
u64 total_blocks;
@@ -2197,33 +2195,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
get_random_bytes(&osb->s_next_generation, sizeof(u32));
- /* FIXME
- * This should be done in ocfs2_journal_init(), but unknown
- * ordering issues will cause the filesystem to crash.
- * If anyone wants to figure out what part of the code
- * refers to osb->journal before ocfs2_journal_init() is run,
- * be my guest.
- */
- /* initialize our journal structure */
-
- journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
- if (!journal) {
- mlog(ML_ERROR, "unable to alloc journal\n");
- status = -ENOMEM;
- goto bail;
- }
- osb->journal = journal;
- journal->j_osb = osb;
-
- atomic_set(&journal->j_num_trans, 0);
- init_rwsem(&journal->j_trans_barrier);
- init_waitqueue_head(&journal->j_checkpointed);
- spin_lock_init(&journal->j_lock);
- journal->j_trans_id = (unsigned long) 1;
- INIT_LIST_HEAD(&journal->j_la_cleanups);
- INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
- journal->j_state = OCFS2_JOURNAL_FREE;
-
INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
init_llist_head(&osb->dquot_drop_list);
@@ -2311,7 +2282,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog_errno(status);
goto bail;
}
- cleancache_init_shared_fs(sb);
osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM);
if (!osb->ocfs2_wq) {
@@ -2404,7 +2374,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* ourselves. */
/* Init our journal object. */
- status = ocfs2_journal_init(osb->journal, &dirty);
+ status = ocfs2_journal_init(osb, &dirty);
if (status < 0) {
mlog(ML_ERROR, "Could not initialize journal!\n");
goto finally;
@@ -2513,12 +2483,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
kfree(osb->osb_orphan_wipes);
kfree(osb->slot_recovery_generations);
- /* FIXME
- * This belongs in journal shutdown, but because we have to
- * allocate osb->journal at the start of ocfs2_initialize_osb(),
- * we free it here.
- */
- kfree(osb->journal);
kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
kfree(osb->vol_label);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dd784eb0cd7c..95d0611c5fc7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7205,7 +7205,7 @@ out:
* Used for reflink a non-preserve-security file.
*
* It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_mutex.
+ * must not hold any lock expect i_rwsem.
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 89725b15a64b..3f297b541713 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,8 @@ const struct inode_operations omfs_file_inops = {
};
const struct address_space_operations omfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = omfs_readpage,
.readahead = omfs_readahead,
.writepage = omfs_writepage,
diff --git a/fs/open.c b/fs/open.c
index daa324606a41..1315253e0247 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -32,6 +32,7 @@
#include <linux/ima.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
+#include <linux/mnt_idmapping.h>
#include "internal.h"
@@ -640,7 +641,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
int chown_common(const struct path *path, uid_t user, gid_t group)
{
- struct user_namespace *mnt_userns;
+ struct user_namespace *mnt_userns, *fs_userns;
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
int error;
@@ -652,8 +653,9 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
gid = make_kgid(current_user_ns(), group);
mnt_userns = mnt_user_ns(path->mnt);
- uid = kuid_from_mnt(mnt_userns, uid);
- gid = kgid_from_mnt(mnt_userns, gid);
+ fs_userns = i_user_ns(inode);
+ uid = mapped_kuid_user(mnt_userns, fs_userns, uid);
+ gid = mapped_kgid_user(mnt_userns, fs_userns, gid);
retry_deleg:
newattrs.ia_valid = ATTR_CTIME;
@@ -833,7 +835,6 @@ static int do_dentry_open(struct file *f,
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
- f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
@@ -856,8 +857,20 @@ static int do_dentry_open(struct file *f,
* of THPs into the page cache will fail.
*/
smp_mb();
- if (filemap_nr_thps(inode->i_mapping))
- truncate_pagecache(inode, 0);
+ if (filemap_nr_thps(inode->i_mapping)) {
+ struct address_space *mapping = inode->i_mapping;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ /*
+ * unmap_mapping_range just need to be called once
+ * here, because the private pages is not need to be
+ * unmapped mapping (e.g. data segment of dynamic
+ * shared libraries here).
+ */
+ unmap_mapping_range(mapping, 0, 0, 0);
+ truncate_inode_pages(mapping, 0);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
}
return 0;
@@ -1248,6 +1261,8 @@ SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
if (err)
return err;
+ audit_openat2_how(&tmp);
+
/* O_LARGEFILE is only allowed for non-O_PATH. */
if (!(tmp.flags & O_PATH) && force_o_largefile())
tmp.flags |= O_LARGEFILE;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index f825176ff4ed..f0b7f4d51a17 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -335,7 +335,7 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
{
struct op_inode_info *oi;
- oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL);
+ oi = alloc_inode_sb(sb, op_inode_cachep, GFP_KERNEL);
if (!oi)
return NULL;
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index fe484cf93e5c..8bbe9486e3a6 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -26,8 +26,10 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
- if (!new_op)
+ if (!new_op) {
+ ret = -ENOMEM;
goto out_put_parent;
+ }
new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
new_op->upcall.req.lookup.parent_refn = parent->refn;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index c1bb4c4b5d67..79c1025d18ea 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -10,7 +10,7 @@
* Linux VFS inode operations.
*/
-#include <linux/bvec.h>
+#include <linux/blkdev.h>
#include <linux/fileattr.h>
#include "protocol.h"
#include "orangefs-kernel.h"
@@ -46,7 +46,7 @@ static int orangefs_writepage_locked(struct page *page,
else
wlen = PAGE_SIZE;
}
- /* Should've been handled in orangefs_invalidatepage. */
+ /* Should've been handled in orangefs_invalidate_folio. */
WARN_ON(off == len || off + wlen > len);
bv.bv_page = page;
@@ -243,7 +243,7 @@ static int orangefs_writepages(struct address_space *mapping,
return ret;
}
-static int orangefs_launder_page(struct page *);
+static int orangefs_launder_folio(struct folio *);
static void orangefs_readahead(struct readahead_control *rac)
{
@@ -290,14 +290,15 @@ static void orangefs_readahead(struct readahead_control *rac)
static int orangefs_readpage(struct file *file, struct page *page)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct iov_iter iter;
struct bio_vec bv;
ssize_t ret;
loff_t off; /* offset into this page */
- if (PageDirty(page))
- orangefs_launder_page(page);
+ if (folio_test_dirty(folio))
+ orangefs_launder_folio(folio);
off = page_offset(page);
bv.bv_page = page;
@@ -330,6 +331,7 @@ static int orangefs_write_begin(struct file *file,
void **fsdata)
{
struct orangefs_write_range *wr;
+ struct folio *folio;
struct page *page;
pgoff_t index;
int ret;
@@ -341,27 +343,28 @@ static int orangefs_write_begin(struct file *file,
return -ENOMEM;
*pagep = page;
+ folio = page_folio(page);
- if (PageDirty(page) && !PagePrivate(page)) {
+ if (folio_test_dirty(folio) && !folio_test_private(folio)) {
/*
* Should be impossible. If it happens, launder the page
* since we don't know what's dirty. This will WARN in
* orangefs_writepage_locked.
*/
- ret = orangefs_launder_page(page);
+ ret = orangefs_launder_folio(folio);
if (ret)
return ret;
}
- if (PagePrivate(page)) {
+ if (folio_test_private(folio)) {
struct orangefs_write_range *wr;
- wr = (struct orangefs_write_range *)page_private(page);
+ wr = folio_get_private(folio);
if (wr->pos + wr->len == pos &&
uid_eq(wr->uid, current_fsuid()) &&
gid_eq(wr->gid, current_fsgid())) {
wr->len += len;
goto okay;
} else {
- ret = orangefs_launder_page(page);
+ ret = orangefs_launder_folio(folio);
if (ret)
return ret;
}
@@ -375,7 +378,7 @@ static int orangefs_write_begin(struct file *file,
wr->len = len;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
- attach_page_private(page, wr);
+ folio_attach_private(folio, wr);
okay:
return 0;
}
@@ -415,47 +418,45 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping,
return copied;
}
-static void orangefs_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static void orangefs_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- struct orangefs_write_range *wr;
- wr = (struct orangefs_write_range *)page_private(page);
+ struct orangefs_write_range *wr = folio_get_private(folio);
if (offset == 0 && length == PAGE_SIZE) {
- kfree(detach_page_private(page));
+ kfree(folio_detach_private(folio));
return;
/* write range entirely within invalidate range (or equal) */
- } else if (page_offset(page) + offset <= wr->pos &&
- wr->pos + wr->len <= page_offset(page) + offset + length) {
- kfree(detach_page_private(page));
+ } else if (folio_pos(folio) + offset <= wr->pos &&
+ wr->pos + wr->len <= folio_pos(folio) + offset + length) {
+ kfree(folio_detach_private(folio));
/* XXX is this right? only caller in fs */
- cancel_dirty_page(page);
+ folio_cancel_dirty(folio);
return;
/* invalidate range chops off end of write range */
- } else if (wr->pos < page_offset(page) + offset &&
- wr->pos + wr->len <= page_offset(page) + offset + length &&
- page_offset(page) + offset < wr->pos + wr->len) {
+ } else if (wr->pos < folio_pos(folio) + offset &&
+ wr->pos + wr->len <= folio_pos(folio) + offset + length &&
+ folio_pos(folio) + offset < wr->pos + wr->len) {
size_t x;
- x = wr->pos + wr->len - (page_offset(page) + offset);
+ x = wr->pos + wr->len - (folio_pos(folio) + offset);
WARN_ON(x > wr->len);
wr->len -= x;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
/* invalidate range chops off beginning of write range */
- } else if (page_offset(page) + offset <= wr->pos &&
- page_offset(page) + offset + length < wr->pos + wr->len &&
- wr->pos < page_offset(page) + offset + length) {
+ } else if (folio_pos(folio) + offset <= wr->pos &&
+ folio_pos(folio) + offset + length < wr->pos + wr->len &&
+ wr->pos < folio_pos(folio) + offset + length) {
size_t x;
- x = page_offset(page) + offset + length - wr->pos;
+ x = folio_pos(folio) + offset + length - wr->pos;
WARN_ON(x > wr->len);
wr->pos += x;
wr->len -= x;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
/* invalidate range entirely within write range (punch hole) */
- } else if (wr->pos < page_offset(page) + offset &&
- page_offset(page) + offset + length < wr->pos + wr->len) {
+ } else if (wr->pos < folio_pos(folio) + offset &&
+ folio_pos(folio) + offset + length < wr->pos + wr->len) {
/* XXX what do we do here... should not WARN_ON */
WARN_ON(1);
/* punch hole */
@@ -467,11 +468,11 @@ static void orangefs_invalidatepage(struct page *page,
/* non-overlapping ranges */
} else {
/* WARN if they do overlap */
- if (!((page_offset(page) + offset + length <= wr->pos) ^
- (wr->pos + wr->len <= page_offset(page) + offset))) {
+ if (!((folio_pos(folio) + offset + length <= wr->pos) ^
+ (wr->pos + wr->len <= folio_pos(folio) + offset))) {
WARN_ON(1);
- printk("invalidate range offset %llu length %u\n",
- page_offset(page) + offset, length);
+ printk("invalidate range offset %llu length %zu\n",
+ folio_pos(folio) + offset, length);
printk("write range offset %llu length %zu\n",
wr->pos, wr->len);
}
@@ -483,7 +484,7 @@ static void orangefs_invalidatepage(struct page *page,
* Thus the following runs if wr was modified above.
*/
- orangefs_launder_page(page);
+ orangefs_launder_folio(folio);
}
static int orangefs_releasepage(struct page *page, gfp_t foo)
@@ -496,17 +497,17 @@ static void orangefs_freepage(struct page *page)
kfree(detach_page_private(page));
}
-static int orangefs_launder_page(struct page *page)
+static int orangefs_launder_folio(struct folio *folio)
{
int r = 0;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
};
- wait_on_page_writeback(page);
- if (clear_page_dirty_for_io(page)) {
- r = orangefs_writepage_locked(page, &wbc);
- end_page_writeback(page);
+ folio_wait_writeback(folio);
+ if (folio_clear_dirty_for_io(folio)) {
+ r = orangefs_writepage_locked(&folio->page, &wbc);
+ folio_end_writeback(folio);
}
return r;
}
@@ -633,19 +634,19 @@ static const struct address_space_operations orangefs_address_operations = {
.readahead = orangefs_readahead,
.readpage = orangefs_readpage,
.writepages = orangefs_writepages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = orangefs_write_begin,
.write_end = orangefs_write_end,
- .invalidatepage = orangefs_invalidatepage,
+ .invalidate_folio = orangefs_invalidate_folio,
.releasepage = orangefs_releasepage,
.freepage = orangefs_freepage,
- .launder_page = orangefs_launder_page,
+ .launder_folio = orangefs_launder_folio,
.direct_IO = orangefs_direct_IO,
};
vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
unsigned long *bitlock = &orangefs_inode->bitlock;
@@ -659,27 +660,27 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
goto out;
}
- lock_page(page);
- if (PageDirty(page) && !PagePrivate(page)) {
+ folio_lock(folio);
+ if (folio_test_dirty(folio) && !folio_test_private(folio)) {
/*
- * Should be impossible. If it happens, launder the page
+ * Should be impossible. If it happens, launder the folio
* since we don't know what's dirty. This will WARN in
* orangefs_writepage_locked.
*/
- if (orangefs_launder_page(page)) {
+ if (orangefs_launder_folio(folio)) {
ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
}
- if (PagePrivate(page)) {
- wr = (struct orangefs_write_range *)page_private(page);
+ if (folio_test_private(folio)) {
+ wr = folio_get_private(folio);
if (uid_eq(wr->uid, current_fsuid()) &&
gid_eq(wr->gid, current_fsgid())) {
- wr->pos = page_offset(page);
+ wr->pos = page_offset(vmf->page);
wr->len = PAGE_SIZE;
goto okay;
} else {
- if (orangefs_launder_page(page)) {
+ if (orangefs_launder_folio(folio)) {
ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
@@ -690,27 +691,27 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
- wr->pos = page_offset(page);
+ wr->pos = page_offset(vmf->page);
wr->len = PAGE_SIZE;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
- attach_page_private(page, wr);
+ folio_attach_private(folio, wr);
okay:
file_update_time(vmf->vma->vm_file);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
+ if (folio->mapping != inode->i_mapping) {
+ folio_unlock(folio);
ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE;
goto out;
}
/*
- * We mark the page dirty already here so that when freeze is in
+ * We mark the folio dirty already here so that when freeze is in
* progress, we are guaranteed that writeback during freezing will
- * see the dirty page and writeprotect it again.
+ * see the dirty folio and writeprotect it again.
*/
- set_page_dirty(page);
- wait_for_stable_page(page);
+ folio_mark_dirty(folio);
+ folio_wait_stable(folio);
ret = VM_FAULT_LOCKED;
out:
sb_end_pagefault(inode->i_sb);
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 538e839590ef..b501dc07f922 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -176,7 +176,7 @@ orangefs_bufmap_free(struct orangefs_bufmap *bufmap)
{
kfree(bufmap->page_array);
kfree(bufmap->desc_array);
- kfree(bufmap->buffer_index_array);
+ bitmap_free(bufmap->buffer_index_array);
kfree(bufmap);
}
@@ -226,8 +226,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
bufmap->desc_size = user_desc->size;
bufmap->desc_shift = ilog2(bufmap->desc_size);
- bufmap->buffer_index_array =
- kzalloc(DIV_ROUND_UP(bufmap->desc_count, BITS_PER_LONG), GFP_KERNEL);
+ bufmap->buffer_index_array = bitmap_zalloc(bufmap->desc_count, GFP_KERNEL);
if (!bufmap->buffer_index_array)
goto out_free_bufmap;
@@ -250,7 +249,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
out_free_desc_array:
kfree(bufmap->desc_array);
out_free_index_array:
- kfree(bufmap->buffer_index_array);
+ bitmap_free(bufmap->buffer_index_array);
out_free_bufmap:
kfree(bufmap);
out:
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 3627ea946402..de80b62553bb 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -894,10 +894,11 @@ static struct attribute *orangefs_default_attrs[] = {
&perf_time_interval_secs_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(orangefs_default);
static struct kobj_type orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
- .default_attrs = orangefs_default_attrs,
+ .default_groups = orangefs_default_groups,
};
static struct orangefs_attribute acache_hard_limit_attribute =
@@ -931,10 +932,11 @@ static struct attribute *acache_orangefs_default_attrs[] = {
&acache_timeout_msecs_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(acache_orangefs_default);
static struct kobj_type acache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
- .default_attrs = acache_orangefs_default_attrs,
+ .default_groups = acache_orangefs_default_groups,
};
static struct orangefs_attribute capcache_hard_limit_attribute =
@@ -968,10 +970,11 @@ static struct attribute *capcache_orangefs_default_attrs[] = {
&capcache_timeout_secs_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(capcache_orangefs_default);
static struct kobj_type capcache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
- .default_attrs = capcache_orangefs_default_attrs,
+ .default_groups = capcache_orangefs_default_groups,
};
static struct orangefs_attribute ccache_hard_limit_attribute =
@@ -1005,10 +1008,11 @@ static struct attribute *ccache_orangefs_default_attrs[] = {
&ccache_timeout_secs_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(ccache_orangefs_default);
static struct kobj_type ccache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
- .default_attrs = ccache_orangefs_default_attrs,
+ .default_groups = ccache_orangefs_default_groups,
};
static struct orangefs_attribute ncache_hard_limit_attribute =
@@ -1042,10 +1046,11 @@ static struct attribute *ncache_orangefs_default_attrs[] = {
&ncache_timeout_msecs_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(ncache_orangefs_default);
static struct kobj_type ncache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
- .default_attrs = ncache_orangefs_default_attrs,
+ .default_groups = ncache_orangefs_default_groups,
};
static struct orangefs_attribute pc_acache_attribute =
@@ -1072,10 +1077,11 @@ static struct attribute *pc_orangefs_default_attrs[] = {
&pc_ncache_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(pc_orangefs_default);
static struct kobj_type pc_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
- .default_attrs = pc_orangefs_default_attrs,
+ .default_groups = pc_orangefs_default_groups,
};
static struct orangefs_attribute stats_reads_attribute =
@@ -1095,10 +1101,11 @@ static struct attribute *stats_orangefs_default_attrs[] = {
&stats_writes_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(stats_orangefs_default);
static struct kobj_type stats_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
- .default_attrs = stats_orangefs_default_attrs,
+ .default_groups = stats_orangefs_default_groups,
};
static struct kobject *orangefs_obj;
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 2f2e430461b2..5254256a224d 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -11,6 +11,7 @@
#include <linux/parser.h>
#include <linux/hashtable.h>
+#include <linux/seq_file.h>
/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
static struct kmem_cache *orangefs_inode_cache;
@@ -106,7 +107,7 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb)
{
struct orangefs_inode_s *orangefs_inode;
- orangefs_inode = kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL);
+ orangefs_inode = alloc_inode_sb(sb, orangefs_inode_cache, GFP_KERNEL);
if (!orangefs_inode)
return NULL;
@@ -475,7 +476,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
const char *devname,
void *data)
{
- int ret = -EINVAL;
+ int ret;
struct super_block *sb = ERR_PTR(-EINVAL);
struct orangefs_kernel_op_s *new_op;
struct dentry *d = ERR_PTR(-EINVAL);
@@ -526,7 +527,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
if (!ORANGEFS_SB(sb)) {
d = ERR_PTR(-ENOMEM);
- goto free_op;
+ goto free_sb_and_op;
}
ret = orangefs_fill_sb(sb,
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 4e7d5bfa2949..e040970408d4 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -140,12 +140,14 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
int err;
err = ovl_real_fileattr_get(old, &oldfa);
- if (err)
- return err;
-
- err = ovl_real_fileattr_get(new, &newfa);
- if (err)
+ if (err) {
+ /* Ntfs-3g returns -EINVAL for "no fileattr support" */
+ if (err == -ENOTTY || err == -EINVAL)
+ return 0;
+ pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
+ old->dentry, err);
return err;
+ }
/*
* We cannot set immutable and append-only flags on upper inode,
@@ -155,10 +157,31 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
*/
if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) {
err = ovl_set_protattr(inode, new->dentry, &oldfa);
- if (err)
+ if (err == -EPERM)
+ pr_warn_once("copying fileattr: no xattr on upper\n");
+ else if (err)
return err;
}
+ /* Don't bother copying flags if none are set */
+ if (!(oldfa.flags & OVL_COPY_FS_FLAGS_MASK))
+ return 0;
+
+ err = ovl_real_fileattr_get(new, &newfa);
+ if (err) {
+ /*
+ * Returning an error if upper doesn't support fileattr will
+ * result in a regression, so revert to the old behavior.
+ */
+ if (err == -ENOTTY || err == -EINVAL) {
+ pr_warn_once("copying fileattr: no support on upper\n");
+ return 0;
+ }
+ pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n",
+ new->dentry, err);
+ return err;
+ }
+
BUILD_BUG_ON(OVL_COPY_FS_FLAGS_MASK & ~FS_COMMON_FL);
newfa.flags &= ~OVL_COPY_FS_FLAGS_MASK;
newfa.flags |= (oldfa.flags & OVL_COPY_FS_FLAGS_MASK);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 93c7c267de93..f18490813170 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -137,8 +137,7 @@ kill_whiteout:
goto out;
}
-static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry,
- umode_t mode)
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
{
int err;
struct dentry *d, *dentry = *newdentry;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index c88ac571593d..fa125feed0ff 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -17,6 +17,7 @@
struct ovl_aio_req {
struct kiocb iocb;
+ refcount_t ref;
struct kiocb *orig_iocb;
struct fd fd;
};
@@ -252,6 +253,14 @@ static rwf_t ovl_iocb_to_rwf(int ifl)
return flags;
}
+static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
+{
+ if (refcount_dec_and_test(&aio_req->ref)) {
+ fdput(aio_req->fd);
+ kmem_cache_free(ovl_aio_request_cachep, aio_req);
+ }
+}
+
static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
{
struct kiocb *iocb = &aio_req->iocb;
@@ -268,18 +277,17 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
}
orig_iocb->ki_pos = iocb->ki_pos;
- fdput(aio_req->fd);
- kmem_cache_free(ovl_aio_request_cachep, aio_req);
+ ovl_aio_put(aio_req);
}
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
{
struct ovl_aio_req *aio_req = container_of(iocb,
struct ovl_aio_req, iocb);
struct kiocb *orig_iocb = aio_req->orig_iocb;
ovl_aio_cleanup_handler(aio_req);
- orig_iocb->ki_complete(orig_iocb, res, res2);
+ orig_iocb->ki_complete(orig_iocb, res);
}
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -319,7 +327,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
aio_req->orig_iocb = iocb;
kiocb_clone(&aio_req->iocb, iocb, real.file);
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
if (ret != -EIOCBQUEUED)
ovl_aio_cleanup_handler(aio_req);
}
@@ -390,7 +400,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
kiocb_clone(&aio_req->iocb, iocb, real.file);
aio_req->iocb.ki_flags = ifl;
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
if (ret != -EIOCBQUEUED)
ovl_aio_cleanup_handler(aio_req);
}
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 832b17589733..1f36158c7dbe 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -610,7 +610,10 @@ int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa)
if (err)
return err;
- return vfs_fileattr_get(realpath->dentry, fa);
+ err = vfs_fileattr_get(realpath->dentry, fa);
+ if (err == -ENOIOCTLCMD)
+ err = -ENOTTY;
+ return err;
}
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 3894f3347955..2cd5741c873b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -570,6 +570,7 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode);
struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct ovl_cattr *attr);
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 178daa5e82c9..001cdbb8f015 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -174,7 +174,7 @@ static struct kmem_cache *ovl_inode_cachep;
static struct inode *ovl_alloc_inode(struct super_block *sb)
{
- struct ovl_inode *oi = kmem_cache_alloc(ovl_inode_cachep, GFP_KERNEL);
+ struct ovl_inode *oi = alloc_inode_sb(sb, ovl_inode_cachep, GFP_KERNEL);
if (!oi)
return NULL;
@@ -787,10 +787,14 @@ retry:
goto retry;
}
- work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode));
- err = PTR_ERR(work);
- if (IS_ERR(work))
- goto out_err;
+ err = ovl_mkdir_real(dir, &work, attr.ia_mode);
+ if (err)
+ goto out_dput;
+
+ /* Weird filesystem returning with hashed negative (kernfs)? */
+ err = -EINVAL;
+ if (d_really_is_negative(work))
+ goto out_dput;
/*
* Try to remove POSIX ACL xattrs from workdir. We are good if:
@@ -869,7 +873,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
pr_err("filesystem on '%s' not supported\n", name);
goto out_put;
}
- if (mnt_user_ns(path->mnt) != &init_user_ns) {
+ if (is_idmapped_mnt(path->mnt)) {
pr_err("idmapped layers are currently not supported\n");
goto out_put;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 6d4342bad9f1..e140ea150bbb 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -25,6 +25,7 @@
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>
+#include <linux/sysctl.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
@@ -50,13 +51,13 @@
* The max size that a non-root user is allowed to grow the pipe. Can
* be set by root in /proc/sys/fs/pipe-max-size
*/
-unsigned int pipe_max_size = 1048576;
+static unsigned int pipe_max_size = 1048576;
/* Maximum allocatable pages per user. Hard limit is unset by default, soft
* matches default values.
*/
-unsigned long pipe_user_pages_hard;
-unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
+static unsigned long pipe_user_pages_hard;
+static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
/*
* We use head and tail indices that aren't masked off, except at the point of
@@ -252,7 +253,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
*/
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
for (;;) {
- unsigned int head = pipe->head;
+ /* Read ->head with a barrier vs post_one_notification() */
+ unsigned int head = smp_load_acquire(&pipe->head);
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;
@@ -605,7 +607,7 @@ out:
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe = filp->private_data;
- int count, head, tail, mask;
+ unsigned int count, head, tail, mask;
switch (cmd) {
case FIONREAD:
@@ -827,13 +829,11 @@ out_free_uid:
void free_pipe_info(struct pipe_inode_info *pipe)
{
- int i;
+ unsigned int i;
#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue) {
+ if (pipe->watch_queue)
watch_queue_clear(pipe->watch_queue);
- put_watch_queue(pipe->watch_queue);
- }
#endif
(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
@@ -843,6 +843,10 @@ void free_pipe_info(struct pipe_inode_info *pipe)
if (buf->ops)
pipe_buf_release(pipe, buf);
}
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue)
+ put_watch_queue(pipe->watch_queue);
+#endif
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
kfree(pipe->bufs);
@@ -1428,6 +1432,60 @@ static struct file_system_type pipe_fs_type = {
.kill_sb = kill_anon_super,
};
+#ifdef CONFIG_SYSCTL
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned int val;
+
+ val = round_pipe_size(*lvalp);
+ if (val == 0)
+ return -EINVAL;
+
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_dopipe_max_size_conv, NULL);
+}
+
+static struct ctl_table fs_pipe_sysctls[] = {
+ {
+ .procname = "pipe-max-size",
+ .data = &pipe_max_size,
+ .maxlen = sizeof(pipe_max_size),
+ .mode = 0644,
+ .proc_handler = proc_dopipe_max_size,
+ },
+ {
+ .procname = "pipe-user-pages-hard",
+ .data = &pipe_user_pages_hard,
+ .maxlen = sizeof(pipe_user_pages_hard),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "pipe-user-pages-soft",
+ .data = &pipe_user_pages_soft,
+ .maxlen = sizeof(pipe_user_pages_soft),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ { }
+};
+#endif
+
static int __init init_pipe_fs(void)
{
int err = register_filesystem(&pipe_fs_type);
@@ -1439,6 +1497,9 @@ static int __init init_pipe_fs(void)
unregister_filesystem(&pipe_fs_type);
}
}
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("fs", fs_pipe_sysctls);
+#endif
return err;
}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index f5c25f580dd9..962d32468eb4 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -23,6 +23,7 @@
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
+#include <linux/mnt_idmapping.h>
static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
@@ -134,8 +135,7 @@ struct posix_acl *get_acl(struct inode *inode, int type)
* to just call ->get_acl to fetch the ACL ourself. (This is going to
* be an unlikely race.)
*/
- if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED)
- /* fall through */ ;
+ cmpxchg(p, ACL_NOT_CACHED, sentinel);
/*
* Normally, the ACL returned by ->get_acl will be cached.
@@ -375,7 +375,9 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
goto check_perm;
break;
case ACL_USER:
- uid = kuid_into_mnt(mnt_userns, pa->e_uid);
+ uid = mapped_kuid_fs(mnt_userns,
+ i_user_ns(inode),
+ pa->e_uid);
if (uid_eq(uid, current_fsuid()))
goto mask;
break;
@@ -388,7 +390,9 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
}
break;
case ACL_GROUP:
- gid = kgid_into_mnt(mnt_userns, pa->e_gid);
+ gid = mapped_kgid_fs(mnt_userns,
+ i_user_ns(inode),
+ pa->e_gid);
if (in_group_p(gid)) {
found = 1;
if ((pa->e_perm & want) == want)
@@ -735,17 +739,17 @@ static void posix_acl_fix_xattr_userns(
case ACL_USER:
uid = make_kuid(from, le32_to_cpu(entry->e_id));
if (from_user)
- uid = kuid_from_mnt(mnt_userns, uid);
+ uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid);
else
- uid = kuid_into_mnt(mnt_userns, uid);
+ uid = mapped_kuid_fs(mnt_userns, &init_user_ns, uid);
entry->e_id = cpu_to_le32(from_kuid(to, uid));
break;
case ACL_GROUP:
gid = make_kgid(from, le32_to_cpu(entry->e_id));
if (from_user)
- gid = kgid_from_mnt(mnt_userns, gid);
+ gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid);
else
- gid = kgid_into_mnt(mnt_userns, gid);
+ gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid);
entry->e_id = cpu_to_le32(from_kgid(to, gid));
break;
default:
@@ -755,9 +759,14 @@ static void posix_acl_fix_xattr_userns(
}
void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
+
+ /* Leave ids untouched on non-idmapped mounts. */
+ if (no_idmapping(mnt_userns, i_user_ns(inode)))
+ mnt_userns = &init_user_ns;
if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
return;
posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value,
@@ -765,9 +774,14 @@ void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
}
void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
+
+ /* Leave ids untouched on non-idmapped mounts. */
+ if (no_idmapping(mnt_userns, i_user_ns(inode)))
+ mnt_userns = &init_user_ns;
if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
return;
posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 49be8c8ef555..eb815759842c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -88,10 +88,10 @@
#include <linux/pid_namespace.h>
#include <linux/prctl.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
+#include <linux/kthread.h>
#include <asm/processor.h>
#include "internal.h"
@@ -102,6 +102,8 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
if (p->flags & PF_WQ_WORKER)
wq_worker_comm(tcomm, sizeof(tcomm), p);
+ else if (p->flags & PF_KTHREAD)
+ get_kthread_comm(tcomm, sizeof(tcomm), p);
else
__get_task_comm(tcomm, sizeof(tcomm), p);
@@ -408,9 +410,9 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
cpumask_pr_args(&task->cpus_mask));
}
-static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
{
- seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
+ seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
seq_putc(m, '\n');
}
@@ -436,7 +438,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
if (mm) {
task_mem(m, mm);
- task_core_dumping(m, mm);
+ task_core_dumping(m, task);
task_thp_status(m, mm);
mmput(mm);
}
@@ -468,6 +470,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
u64 cgtime, gtime;
unsigned long rsslim = 0;
unsigned long flags;
+ int exit_code = task->exit_code;
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -531,6 +534,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
maj_flt += sig->maj_flt;
thread_group_cputime_adjusted(task, &utime, &stime);
gtime += sig->gtime;
+
+ if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
+ exit_code = sig->group_exit_code;
}
sid = task_session_nr_ns(task, ns);
@@ -541,7 +547,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
}
if (permitted && (!whole || num_threads < 2))
- wchan = get_wchan(task);
+ wchan = !task_is_running(task);
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
@@ -606,10 +612,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
*
* This works with older implementations of procps as well.
*/
- if (wchan)
- seq_puts(m, " 1");
- else
- seq_puts(m, " 0");
+ seq_put_decimal_ull(m, " ", wchan);
seq_put_decimal_ull(m, " ", 0);
seq_put_decimal_ull(m, " ", 0);
@@ -633,7 +636,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_puts(m, " 0 0 0 0 0 0 0");
if (permitted)
- seq_put_decimal_ll(m, " ", task->exit_code);
+ seq_put_decimal_ll(m, " ", exit_code);
else
seq_puts(m, " 0");
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 533d5836eb9a..c1031843cc6a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -67,13 +67,13 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/printk.h>
#include <linux/cache.h>
#include <linux/cgroup.h>
@@ -386,17 +386,19 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
unsigned long wchan;
+ char symname[KSYM_NAME_LEN];
- if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
- wchan = get_wchan(task);
- else
- wchan = 0;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto print0;
- if (wchan)
- seq_printf(m, "%ps", (void *) wchan);
- else
- seq_putc(m, '0');
+ wchan = get_wchan(task);
+ if (wchan && !lookup_symbol_name(wchan, symname)) {
+ seq_puts(m, symname);
+ return 0;
+ }
+print0:
+ seq_putc(m, '0');
return 0;
}
#endif /* CONFIG_KALLSYMS */
@@ -667,10 +669,10 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
/************************************************************************/
/* permission checks */
-static int proc_fd_access_allowed(struct inode *inode)
+static bool proc_fd_access_allowed(struct inode *inode)
{
struct task_struct *task;
- int allowed = 0;
+ bool allowed = false;
/* Allow access to a task's file descriptors if it is us or we
* may use ptrace attach to the process and find out that
* information.
@@ -1761,25 +1763,25 @@ out:
static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
{
- char *tmp = (char *)__get_free_page(GFP_KERNEL);
+ char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
int len;
if (!tmp)
return -ENOMEM;
- pathname = d_path(path, tmp, PAGE_SIZE);
+ pathname = d_path(path, tmp, PATH_MAX);
len = PTR_ERR(pathname);
if (IS_ERR(pathname))
goto out;
- len = tmp + PAGE_SIZE - 1 - pathname;
+ len = tmp + PATH_MAX - 1 - pathname;
if (len > buflen)
len = buflen;
if (copy_to_user(buffer, pathname, len))
len = -EFAULT;
out:
- free_page((unsigned long)tmp);
+ kfree(tmp);
return len;
}
@@ -1979,19 +1981,21 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
+ int ret = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- inode = d_inode(dentry);
- task = get_proc_task(inode);
+ rcu_read_lock();
+ inode = d_inode_rcu(dentry);
+ if (!inode)
+ goto out;
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
pid_update_inode(task, inode);
- put_task_struct(task);
- return 1;
+ ret = 1;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static inline bool proc_inode_is_dead(struct inode *inode)
@@ -3799,7 +3803,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
task = next_tid(task), ctx->pos++) {
char name[10 + 1];
unsigned int len;
+
tid = task_pid_nr_ns(task, ns);
+ if (!tid)
+ continue; /* The task has just exited. */
len = snprintf(name, sizeof(name), "%u", tid);
if (!proc_fill_cache(file, ctx, name, len,
proc_task_instantiate, task, NULL)) {
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 6d8d4bf20837..2e244ada1f97 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -32,6 +32,8 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
int ret = 0;
key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
xbc_for_each_key_value(leaf, val) {
ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 172c86270b31..913bef0d2a36 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -72,7 +72,7 @@ out:
return 0;
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
+static int proc_fdinfo_access_allowed(struct inode *inode)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
if (!allowed)
return -EACCES;
+ return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
return single_open(file, seq_show, inode);
}
@@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
proc_fdinfo_instantiate);
}
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
+ .open = proc_open_fdinfo,
.read = generic_read_dir,
.iterate_shared = proc_readfdinfo,
.llseek = generic_file_llseek,
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 5b78739e60e4..f2132407e133 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -791,12 +791,6 @@ void proc_remove(struct proc_dir_entry *de)
}
EXPORT_SYMBOL(proc_remove);
-void *PDE_DATA(const struct inode *inode)
-{
- return __PDE_DATA(inode);
-}
-EXPORT_SYMBOL(PDE_DATA);
-
/*
* Pull a user buffer into memory and pass it to the file's write handler if
* one is supplied. The ->write() method is permitted to modify the
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 599eb724ff2d..73aeb4e6d32e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -66,7 +66,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
{
struct proc_inode *ei;
- ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
@@ -650,6 +650,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
return NULL;
}
+ inode->i_private = de->data;
inode->i_ino = de->low_ino;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
PROC_I(inode)->pde = de;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 03415f3fb3a8..06a80f78433d 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -115,11 +115,6 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
return PROC_I(inode)->pde;
}
-static inline void *__PDE_DATA(const struct inode *inode)
-{
- return PDE(inode)->data;
-}
-
static inline struct pid *proc_pid(const struct inode *inode)
{
return PROC_I(inode)->pid;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9f1077d94cde..a2873a617ae8 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -10,6 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 15c2e55d2ed2..e1cfeda397f3 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -61,15 +61,27 @@ static int seq_open_net(struct inode *inode, struct file *file)
}
#ifdef CONFIG_NET_NS
p->net = net;
+ netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL);
#endif
return 0;
}
+static void seq_file_net_put_net(struct seq_file *seq)
+{
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *priv = seq->private;
+
+ put_net_track(priv->net, &priv->ns_tracker);
+#else
+ put_net(&init_net);
+#endif
+}
+
static int seq_release_net(struct inode *ino, struct file *f)
{
struct seq_file *seq = f->private_data;
- put_net(seq_file_net(seq));
+ seq_file_net_put_net(seq);
seq_release_private(ino, f);
return 0;
}
@@ -87,7 +99,8 @@ int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux)
#ifdef CONFIG_NET_NS
struct seq_net_private *p = priv_data;
- p->net = get_net(current->nsproxy->net_ns);
+ p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker,
+ GFP_KERNEL);
#endif
return 0;
}
@@ -97,7 +110,7 @@ void bpf_iter_fini_seq_net(void *priv_data)
#ifdef CONFIG_NET_NS
struct seq_net_private *p = priv_data;
- put_net(p->net);
+ put_net_track(p->net, &p->ns_tracker);
#endif
}
@@ -125,7 +138,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data);
* @parent: The parent directory in which to create.
* @ops: The seq_file ops with which to read the file.
* @write: The write method with which to 'modify' the file.
- * @data: Data for retrieval by PDE_DATA().
+ * @data: Data for retrieval by pde_data().
*
* Create a network namespaced proc file in the @parent directory with the
* specified @name and @mode that allows reading of a file that displays a
@@ -140,7 +153,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data);
* modified by the @write function. @write should return 0 on success.
*
* The @data value is accessible from the @show and @write functions by calling
- * PDE_DATA() on the file inode. The network namespace must be accessed by
+ * pde_data() on the file inode. The network namespace must be accessed by
* calling seq_file_net() on the seq_file struct.
*/
struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode,
@@ -217,7 +230,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single);
* @parent: The parent directory in which to create.
* @show: The seqfile show method with which to read the file.
* @write: The write method with which to 'modify' the file.
- * @data: Data for retrieval by PDE_DATA().
+ * @data: Data for retrieval by pde_data().
*
* Create a network-namespaced proc file in the @parent directory with the
* specified @name and @mode that allows reading of a file that displays a
@@ -232,7 +245,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single);
* modified by the @write function. @write should return 0 on success.
*
* The @data value is accessible from the @show and @write functions by calling
- * PDE_DATA() on the file inode. The network namespace must be accessed by
+ * pde_data() on the file inode. The network namespace must be accessed by
* calling seq_file_single_net() on the seq_file struct.
*/
struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode,
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5d66faecd4ef..7d9cfc730bd4 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/bpf-cgroup.h>
#include <linux/mount.h>
+#include <linux/kmemleak.h>
#include "internal.h"
static const struct dentry_operations proc_sys_dentry_operations;
@@ -25,15 +26,32 @@ static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
/* shared constants to be used in various sysctls */
-const int sysctl_vals[] = { 0, 1, INT_MAX };
+const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX, 65535 };
EXPORT_SYMBOL(sysctl_vals);
+const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
+EXPORT_SYMBOL_GPL(sysctl_long_vals);
+
/* Support for permanently empty directories */
struct ctl_table sysctl_mount_point[] = {
{ }
};
+/**
+ * register_sysctl_mount_point() - registers a sysctl mount point
+ * @path: path for the mount point
+ *
+ * Used to create a permanently empty directory to serve as mount point.
+ * There are some subtle but important permission checks this allows in the
+ * case of unprivileged mounts.
+ */
+struct ctl_table_header *register_sysctl_mount_point(const char *path)
+{
+ return register_sysctl(path, sysctl_mount_point);
+}
+EXPORT_SYMBOL(register_sysctl_mount_point);
+
static bool is_empty_dir(struct ctl_table_header *head)
{
return head->ctl_table[0].child == sysctl_mount_point;
@@ -163,7 +181,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
else {
pr_err("sysctl duplicate entry: ");
sysctl_print_dir(head->parent);
- pr_cont("/%s\n", entry->procname);
+ pr_cont("%s\n", entry->procname);
return -EEXIST;
}
}
@@ -1020,8 +1038,8 @@ failed:
if (IS_ERR(subdir)) {
pr_err("sysctl could not get directory: ");
sysctl_print_dir(dir);
- pr_cont("/%*.*s %ld\n",
- namelen, namelen, name, PTR_ERR(subdir));
+ pr_cont("%*.*s %ld\n", namelen, namelen, name,
+ PTR_ERR(subdir));
}
drop_sysctl_table(&dir->header);
if (new)
@@ -1053,7 +1071,6 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
struct ctl_dir *dir;
int ret;
- ret = 0;
spin_lock(&sysctl_lock);
root = (*pentry)->data;
set = lookup_header_set(root);
@@ -1384,6 +1401,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab
}
EXPORT_SYMBOL(register_sysctl);
+/**
+ * __register_sysctl_init() - register sysctl table to path
+ * @path: path name for sysctl base
+ * @table: This is the sysctl table that needs to be registered to the path
+ * @table_name: The name of sysctl table, only used for log printing when
+ * registration fails
+ *
+ * The sysctl interface is used by userspace to query or modify at runtime
+ * a predefined value set on a variable. These variables however have default
+ * values pre-set. Code which depends on these variables will always work even
+ * if register_sysctl() fails. If register_sysctl() fails you'd just lose the
+ * ability to query or modify the sysctls dynamically at run time. Chances of
+ * register_sysctl() failing on init are extremely low, and so for both reasons
+ * this function does not return any error as it is used by initialization code.
+ *
+ * Context: Can only be called after your respective sysctl base path has been
+ * registered. So for instance, most base directories are registered early on
+ * init before init levels are processed through proc_sys_init() and
+ * sysctl_init_bases().
+ */
+void __init __register_sysctl_init(const char *path, struct ctl_table *table,
+ const char *table_name)
+{
+ struct ctl_table_header *hdr = register_sysctl(path, table);
+
+ if (unlikely(!hdr)) {
+ pr_err("failed when register_sysctl %s to %s\n", table_name, path);
+ return;
+ }
+ kmemleak_not_leak(hdr);
+}
+
static char *append_path(const char *path, char *pos, const char *name)
{
int namelen;
@@ -1597,6 +1646,15 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
}
EXPORT_SYMBOL(register_sysctl_table);
+int __register_sysctl_base(struct ctl_table *base_table)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_sysctl_table(base_table);
+ kmemleak_not_leak(hdr);
+ return 0;
+}
+
static void put_links(struct ctl_table_header *header)
{
struct ctl_table_set *root_set = &sysctl_table_root.default_set;
@@ -1626,7 +1684,7 @@ static void put_links(struct ctl_table_header *header)
else {
pr_err("sysctl link missing during unregister: ");
sysctl_print_dir(parent);
- pr_cont("/%s\n", name);
+ pr_cont("%s\n", name);
}
}
}
@@ -1710,7 +1768,7 @@ int __init proc_sys_init(void)
proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
- return sysctl_init();
+ return sysctl_init_bases();
}
struct sysctl_alias {
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6561a06ef905..4fb8729a68d4 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -24,7 +24,7 @@
#ifdef arch_idle_time
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
@@ -46,7 +46,7 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
#else
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cf25be3e0321..f46060eb91b5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/vmacache.h>
+#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
@@ -308,6 +309,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma);
if (!name) {
+ struct anon_vma_name *anon_name;
+
if (!mm) {
name = "[vdso]";
goto done;
@@ -319,8 +322,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- if (is_stack(vma))
+ if (is_stack(vma)) {
name = "[stack]";
+ goto done;
+ }
+
+ anon_name = anon_vma_name(vma);
+ if (anon_name) {
+ seq_pad(m, ' ');
+ seq_printf(m, "[anon:%s]", anon_name->name);
+ }
}
done:
@@ -397,7 +408,6 @@ struct mem_size_stats {
u64 pss_shmem;
u64 pss_locked;
u64 swap_pss;
- bool check_shmem_swap;
};
static void smaps_page_accumulate(struct mem_size_stats *mss,
@@ -430,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
}
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty, bool locked)
+ bool compound, bool young, bool dirty, bool locked,
+ bool migration)
{
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -457,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* page_count(page) == 1 guarantees the page is mapped exactly once.
* If any subpage of the compound page mapped with PTE it would elevate
* page_count().
+ *
+ * The page_mapcount() is called to get a snapshot of the mapcount.
+ * Without holding the page lock this snapshot can be slightly wrong as
+ * we cannot always read the mapcount atomically. It is not safe to
+ * call page_mapcount() even with PTL held if the page is not mapped,
+ * especially for migration entries. Treat regular migration entries
+ * as mapcount == 1.
*/
- if (page_count(page) == 1) {
+ if ((page_count(page) == 1) || migration) {
smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
locked, true);
return;
@@ -478,9 +496,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
__always_unused int depth, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
- mss->swap += shmem_partial_swap_usage(
- walk->vma->vm_file->f_mapping, addr, end);
+ mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
+ linear_page_index(vma, addr),
+ linear_page_index(vma, end));
return 0;
}
@@ -488,6 +508,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
#define smaps_pte_hole NULL
#endif /* CONFIG_SHMEM */
+static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
+{
+#ifdef CONFIG_SHMEM
+ if (walk->ops->pte_hole) {
+ /* depth is not used */
+ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
+ }
+#endif
+}
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -495,6 +525,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool migration = false;
if (pte_present(*pte)) {
page = vm_normal_page(vma, addr, *pte);
@@ -514,21 +545,21 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_pfn_swap_entry(swpent))
+ } else if (is_pfn_swap_entry(swpent)) {
+ if (is_migration_entry(swpent))
+ migration = true;
page = pfn_swap_entry_to_page(swpent);
- } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
- && pte_none(*pte))) {
- page = xa_load(&vma->vm_file->f_mapping->i_pages,
- linear_page_index(vma, addr));
- if (xa_is_value(page))
- mss->swap += PAGE_SIZE;
+ }
+ } else {
+ smaps_pte_hole_lookup(addr, walk);
return;
}
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
+ locked, migration);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -539,6 +570,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool migration = false;
if (pmd_present(*pmd)) {
/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -546,8 +578,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- if (is_migration_entry(entry))
+ if (is_migration_entry(entry)) {
+ migration = true;
page = pfn_swap_entry_to_page(entry);
+ }
}
if (IS_ERR_OR_NULL(page))
return;
@@ -559,7 +593,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
/* pass */;
else
mss->file_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+ locked, migration);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -735,8 +771,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
return;
#ifdef CONFIG_SHMEM
- /* In case of smaps_rollup, reset the value from previous vma */
- mss->check_shmem_swap = false;
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -754,7 +788,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
!(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
- mss->check_shmem_swap = true;
ops = &smaps_shmem_walk_ops;
}
}
@@ -1363,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
+ bool migration = false;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1384,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
+ migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1406,8 +1441,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
spinlock_t *ptl;
pte_t *pte, *orig_pte;
int err = 0;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool migration = false;
+
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
u64 flags = 0, frame = 0;
@@ -1446,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ migration = is_migration_entry(entry);
page = pfn_swap_entry_to_page(entry);
}
#endif
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
for (; addr != end; addr += PAGE_SIZE) {
@@ -1560,7 +1597,8 @@ static const struct mm_walk_ops pagemap_ops = {
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
- * Bits 57-60 zero
+ * Bit 57 pte is uffd-wp write-protected
+ * Bits 58-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 5a1b228964fb..deb99bc9b7e6 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -12,18 +12,22 @@ static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec64 uptime;
struct timespec64 idle;
- u64 nsec;
+ u64 idle_nsec;
u32 rem;
int i;
- nsec = 0;
- for_each_possible_cpu(i)
- nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
+ idle_nsec = 0;
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcs;
+
+ kcpustat_cpu_fetch(&kcs, i);
+ idle_nsec += get_idle_time(&kcs, i);
+ }
ktime_get_boottime_ts64(&uptime);
timens_add_boottime(&uptime);
- idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9a15334da208..6f1b8ddc6f7a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -26,7 +26,7 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <asm/io.h>
#include "internal.h"
@@ -62,46 +62,71 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;
-/*
- * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
- * The called function has to take care of module refcounting.
- */
-static int (*oldmem_pfn_is_ram)(unsigned long pfn);
+static DEFINE_SPINLOCK(vmcore_cb_lock);
+DEFINE_STATIC_SRCU(vmcore_cb_srcu);
+/* List of registered vmcore callbacks. */
+static LIST_HEAD(vmcore_cb_list);
+/* Whether the vmcore has been opened once. */
+static bool vmcore_opened;
-int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+void register_vmcore_cb(struct vmcore_cb *cb)
{
- if (oldmem_pfn_is_ram)
- return -EBUSY;
- oldmem_pfn_is_ram = fn;
- return 0;
+ INIT_LIST_HEAD(&cb->next);
+ spin_lock(&vmcore_cb_lock);
+ list_add_tail(&cb->next, &vmcore_cb_list);
+ /*
+ * Registering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., manual driver loading).
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback registration\n");
+ spin_unlock(&vmcore_cb_lock);
}
-EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(register_vmcore_cb);
-void unregister_oldmem_pfn_is_ram(void)
+void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- oldmem_pfn_is_ram = NULL;
- wmb();
+ spin_lock(&vmcore_cb_lock);
+ list_del_rcu(&cb->next);
+ /*
+ * Unregistering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., forced driver removal), but we cannot stop
+ * unregistering.
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback unregistration\n");
+ spin_unlock(&vmcore_cb_lock);
+
+ synchronize_srcu(&vmcore_cb_srcu);
}
-EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
-static int pfn_is_ram(unsigned long pfn)
+static bool pfn_is_ram(unsigned long pfn)
{
- int (*fn)(unsigned long pfn);
- /* pfn is ram unless fn() checks pagetype */
- int ret = 1;
+ struct vmcore_cb *cb;
+ bool ret = true;
- /*
- * Ask hypervisor if the pfn is really ram.
- * A ballooned page contains no data and reading from such a page
- * will cause high load in the hypervisor.
- */
- fn = oldmem_pfn_is_ram;
- if (fn)
- ret = fn(pfn);
+ list_for_each_entry_srcu(cb, &vmcore_cb_list, next,
+ srcu_read_lock_held(&vmcore_cb_srcu)) {
+ if (unlikely(!cb->pfn_is_ram))
+ continue;
+ ret = cb->pfn_is_ram(cb, pfn);
+ if (!ret)
+ break;
+ }
return ret;
}
+static int open_vmcore(struct inode *inode, struct file *file)
+{
+ spin_lock(&vmcore_cb_lock);
+ vmcore_opened = true;
+ spin_unlock(&vmcore_cb_lock);
+
+ return 0;
+}
+
/* Reads a page from the oldmem device from given offset. */
ssize_t read_from_oldmem(char *buf, size_t count,
u64 *ppos, int userbuf,
@@ -110,6 +135,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
unsigned long pfn, offset;
size_t nr_bytes;
ssize_t read = 0, tmp;
+ int idx;
if (!count)
return 0;
@@ -117,6 +143,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
@@ -124,9 +151,13 @@ ssize_t read_from_oldmem(char *buf, size_t count,
nr_bytes = count;
/* If pfn is not ram, return zeros for sparse dump files */
- if (pfn_is_ram(pfn) == 0)
- memset(buf, 0, nr_bytes);
- else {
+ if (!pfn_is_ram(pfn)) {
+ tmp = 0;
+ if (!userbuf)
+ memset(buf, 0, nr_bytes);
+ else if (clear_user(buf, nr_bytes))
+ tmp = -EFAULT;
+ } else {
if (encrypted)
tmp = copy_oldmem_page_encrypted(pfn, buf,
nr_bytes,
@@ -135,10 +166,12 @@ ssize_t read_from_oldmem(char *buf, size_t count,
else
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
-
- if (tmp < 0)
- return tmp;
}
+ if (tmp < 0) {
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+ return tmp;
+ }
+
*ppos += nr_bytes;
count -= nr_bytes;
buf += nr_bytes;
@@ -146,6 +179,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
++pfn;
offset = 0;
} while (count);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return read;
}
@@ -177,7 +211,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, mem_encrypt_active());
+ return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
}
/*
@@ -378,7 +412,7 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start,
- userbuf, mem_encrypt_active());
+ userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
if (tmp < 0)
return tmp;
buflen -= tsz;
@@ -446,7 +480,7 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
/**
* vmcore_alloc_buf - allocate buffer in vmalloc memory
- * @sizez: size of buffer
+ * @size: size of buffer
*
* If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
* the buffer to user-space by means of remap_vmalloc_range().
@@ -537,14 +571,19 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
+ int ret, idx;
+
/*
- * Check if oldmem_pfn_is_ram was registered to avoid
- * looping over all pages without a reason.
+ * Check if a callback was registered to avoid looping over all
+ * pages without a reason.
*/
- if (oldmem_pfn_is_ram)
- return remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
+ if (!list_empty(&vmcore_cb_list))
+ ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
- return remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+ return ret;
}
static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
@@ -668,6 +707,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
#endif
static const struct proc_ops vmcore_proc_ops = {
+ .proc_open = open_vmcore,
.proc_read = read_vmcore,
.proc_lseek = default_llseek,
.proc_mmap = mmap_vmcore,
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 392ef5162655..49650e54d2f8 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -80,7 +80,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, fs_infop->str);
}
- if (mnt_user_ns(mnt) != &init_user_ns)
+ if (is_idmapped_mnt(mnt))
seq_puts(m, ",idmapped");
}
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 328da35da390..8adabde685f1 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -173,7 +173,6 @@ config PSTORE_BLK
tristate "Log panic/oops to a block device"
depends on PSTORE
depends on BLOCK
- depends on BROKEN
select PSTORE_ZONE
default n
help
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
index 04ce58c939a0..4ae0cfcd15f2 100644
--- a/fs/pstore/blk.c
+++ b/fs/pstore/blk.c
@@ -205,7 +205,6 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
static int __register_pstore_blk(struct pstore_device_info *dev,
const char *devpath)
{
- struct inode *inode;
int ret = -ENODEV;
lockdep_assert_held(&pstore_blk_lock);
@@ -217,14 +216,13 @@ static int __register_pstore_blk(struct pstore_device_info *dev,
goto err;
}
- inode = file_inode(psblk_file);
- if (!S_ISBLK(inode->i_mode)) {
+ if (!S_ISBLK(file_inode(psblk_file)->i_mode)) {
pr_err("'%s' is not block device!\n", devpath);
goto err_fput;
}
- inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode;
- dev->zone.total_size = i_size_read(inode);
+ dev->zone.total_size =
+ bdev_nr_bytes(I_BDEV(psblk_file->f_mapping->host));
ret = __register_pstore_device(dev);
if (ret)
@@ -311,7 +309,7 @@ static int __init __best_effort_init(void)
if (ret)
kfree(best_effort_dev);
else
- pr_info("attached %s (%zu) (no dedicated panic_write!)\n",
+ pr_info("attached %s (%lu) (no dedicated panic_write!)\n",
blkdev, best_effort_dev->zone.total_size);
return ret;
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 5939595f0115..776cae20af4e 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -64,20 +64,12 @@ static struct ftrace_ops pstore_ftrace_ops __read_mostly = {
static DEFINE_MUTEX(pstore_ftrace_lock);
static bool pstore_ftrace_enabled;
-static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf,
- size_t count, loff_t *ppos)
+static int pstore_set_ftrace_enabled(bool on)
{
- u8 on;
ssize_t ret;
- ret = kstrtou8_from_user(buf, count, 2, &on);
- if (ret)
- return ret;
-
- mutex_lock(&pstore_ftrace_lock);
-
- if (!on ^ pstore_ftrace_enabled)
- goto out;
+ if (on == pstore_ftrace_enabled)
+ return 0;
if (on) {
ftrace_ops_set_global_filter(&pstore_ftrace_ops);
@@ -89,15 +81,30 @@ static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf,
if (ret) {
pr_err("%s: unable to %sregister ftrace ops: %zd\n",
__func__, on ? "" : "un", ret);
- goto err;
+ } else {
+ pstore_ftrace_enabled = on;
}
- pstore_ftrace_enabled = on;
-out:
- ret = count;
-err:
+ return ret;
+}
+
+static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u8 on;
+ ssize_t ret;
+
+ ret = kstrtou8_from_user(buf, count, 2, &on);
+ if (ret)
+ return ret;
+
+ mutex_lock(&pstore_ftrace_lock);
+ ret = pstore_set_ftrace_enabled(on);
mutex_unlock(&pstore_ftrace_lock);
+ if (ret == 0)
+ ret = count;
+
return ret;
}
@@ -117,6 +124,11 @@ static const struct file_operations pstore_knob_fops = {
static struct dentry *pstore_ftrace_dir;
+static bool record_ftrace;
+module_param(record_ftrace, bool, 0400);
+MODULE_PARM_DESC(record_ftrace,
+ "enable ftrace recording immediately (default: off)");
+
void pstore_register_ftrace(void)
{
if (!psinfo->write)
@@ -124,6 +136,8 @@ void pstore_register_ftrace(void)
pstore_ftrace_dir = debugfs_create_dir("pstore", NULL);
+ pstore_set_ftrace_enabled(record_ftrace);
+
debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir, NULL,
&pstore_knob_fops);
}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b9614db48b1d..e26162f102ff 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -143,21 +143,22 @@ static void pstore_timer_kick(void)
mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
-/*
- * Should pstore_dump() wait for a concurrent pstore_dump()? If
- * not, the current pstore_dump() will report a failure to dump
- * and return.
- */
-static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
+static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
{
- /* In NMI path, pstore shouldn't block regardless of reason. */
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
- /* Emergency restart shouldn't be blocked. */
+ /*
+ * Emergency restart shouldn't be blocked by spinning on
+ * pstore_info::buf_lock.
+ */
case KMSG_DUMP_EMERG:
return true;
default:
@@ -218,7 +219,7 @@ static int zbufsize_842(size_t size)
#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
static int zbufsize_zstd(size_t size)
{
- return ZSTD_compressBound(size);
+ return zstd_compress_bound(size);
}
#endif
@@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long total = 0;
const char *why;
unsigned int part = 1;
+ unsigned long flags = 0;
int ret;
why = kmsg_dump_reason_str(reason);
- if (down_trylock(&psinfo->buf_lock)) {
- /* Failed to acquire lock: give up if we cannot wait. */
- if (pstore_cannot_wait(reason)) {
- pr_err("dump skipped in %s path: may corrupt error record\n",
- in_nmi() ? "NMI" : why);
- return;
- }
- if (down_interruptible(&psinfo->buf_lock)) {
- pr_err("could not grab semaphore?!\n");
+ if (pstore_cannot_block_path(reason)) {
+ if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
+ pr_err("dump skipped in %s path because of concurrent dump\n",
+ in_nmi() ? "NMI" : why);
return;
}
+ } else {
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
}
kmsg_dump_rewind(&iter);
@@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += record.size;
part++;
}
-
- up(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
@@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
- sema_init(&psinfo->buf_lock, 1);
+ spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index fe5305028c6e..a89e33719fcf 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -263,10 +263,10 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
if (prz->corrected_bytes || prz->bad_blocks)
ret = snprintf(str, len, ""
- "\n%d Corrected bytes, %d unrecoverable blocks\n",
+ "\nECC: %d Corrected bytes, %d unrecoverable blocks\n",
prz->corrected_bytes, prz->bad_blocks);
else
- ret = snprintf(str, len, "\nNo errors detected\n");
+ ret = snprintf(str, len, "\nECC: No errors detected\n");
return ret;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 3fb7fc819b4f..a635bb6615e9 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -338,7 +338,7 @@ static struct kmem_cache *qnx4_inode_cachep;
static struct inode *qnx4_alloc_inode(struct super_block *sb)
{
struct qnx4_inode_info *ei;
- ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, qnx4_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 61191f7bdf62..9d8e7e9788a1 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -597,7 +597,7 @@ static struct kmem_cache *qnx6_inode_cachep;
static struct inode *qnx6_alloc_inode(struct super_block *sb)
{
struct qnx6_inode_info *ei;
- ei = kmem_cache_alloc(qnx6_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, qnx6_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 22d904bde6ab..a74aef99bd3d 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -690,9 +690,14 @@ int dquot_quota_sync(struct super_block *sb, int type)
/* This is not very clever (and fast) but currently I don't know about
* any other simple way of getting quota data to disk and we must get
* them there for userspace to be visible... */
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, 1);
- sync_blockdev(sb->s_bdev);
+ if (sb->s_op->sync_fs) {
+ ret = sb->s_op->sync_fs(sb, 1);
+ if (ret)
+ return ret;
+ }
+ ret = sync_blockdev(sb->s_bdev);
+ if (ret)
+ return ret;
/*
* Now when everything is written we can discard the pagecache so
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2bcc9a6f1bfc..052f143e2e0e 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,6 +10,7 @@
#include <linux/namei.h>
#include <linux/slab.h>
#include <asm/current.h>
+#include <linux/blkdev.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/security.h>
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index d3e995e1046f..5f2405994280 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -414,6 +414,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
quota_error(dquot->dq_sb, "Quota structure has offset to "
"other block (%u) than it should (%u)", blk,
(uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+ ret = -EIO;
goto out_buf;
}
ret = read_blk(info, blk, buf);
@@ -479,6 +480,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) {
+ quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+ newblk, info->dqi_blocks);
+ ret = -EUCLEAN;
+ goto out_buf;
+ }
+
if (depth == info->dqi_qtree_depth - 1) {
ret = free_dqentry(info, dquot, newblk);
newblk = 0;
@@ -578,6 +586,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
if (!blk) /* No reference? */
goto out_buf;
+ if (blk < QT_TREEOFF || blk >= info->dqi_blocks) {
+ quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+ blk, info->dqi_blocks);
+ ret = -EUCLEAN;
+ goto out_buf;
+ }
+
if (depth < info->dqi_qtree_depth - 1)
ret = find_tree_dqentry(info, dquot, blk, depth+1);
else
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 65e7e56005b8..bc66d0173e33 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include <linux/seq_file.h>
#include "internal.h"
struct ramfs_mount_opts {
@@ -203,17 +204,20 @@ static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
int opt;
opt = fs_parse(fc, ramfs_fs_parameters, param, &result);
- if (opt < 0) {
+ if (opt == -ENOPARAM) {
+ opt = vfs_parse_fs_param_source(fc, param);
+ if (opt != -ENOPARAM)
+ return opt;
/*
* We might like to report bad mount options here;
* but traditionally ramfs has ignored all mount options,
* and as it is used as a !CONFIG_SHMEM simple substitute
* for tmpfs, better continue to ignore other mount options.
*/
- if (opt == -ENOPARAM)
- opt = 0;
- return opt;
+ return 0;
}
+ if (opt < 0)
+ return opt;
switch (opt) {
case Opt_mode:
diff --git a/fs/read_write.c b/fs/read_write.c
index af057c57bdc6..e643aec2b0ef 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
if (unlikely((ssize_t) count < 0))
return -EINVAL;
- /*
- * ranged mandatory locking does not apply to streams - it makes sense
- * only for files where position has a meaning.
- */
if (ppos) {
loff_t pos = *ppos;
@@ -389,6 +385,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
return security_file_permission(file,
read_write == READ ? MAY_READ : MAY_WRITE);
}
+EXPORT_SYMBOL(rw_verify_area);
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
@@ -1621,35 +1618,41 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
return 0;
}
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+/* Like generic_write_checks(), but takes size of write instead of iter. */
+int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- loff_t count;
- int ret;
if (IS_SWAPFILE(inode))
return -ETXTBSY;
- if (!iov_iter_count(from))
+ if (!*count)
return 0;
- /* FIXME: this is for backwards compatibility with 2.4 */
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
- count = iov_iter_count(from);
- ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+ return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
+}
+EXPORT_SYMBOL(generic_write_checks_count);
+
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Can adjust writing position or amount of bytes to write.
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
+ */
+ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ loff_t count = iov_iter_count(from);
+ int ret;
+
+ ret = generic_write_checks_count(iocb, &count);
if (ret)
return ret;
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 8fd54ed8f844..33c8b0dd07a2 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,10 +1,14 @@
# SPDX-License-Identifier: GPL-2.0-only
config REISERFS_FS
- tristate "Reiserfs support"
+ tristate "Reiserfs support (deprecated)"
select CRC32
help
- Stores not just filenames but the files themselves in a balanced
- tree. Uses journalling.
+ Reiserfs is deprecated and scheduled to be removed from the kernel
+ in 2025. If you are still using it, please migrate to another
+ filesystem or tell us your usecase for reiserfs.
+
+ Reiserfs stores not just filenames but the files themselves in a
+ balanced tree. Uses journalling.
Balanced trees are more efficient than traditional file system
architectural foundations.
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f49b72ccac4c..36c59b25486c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2763,13 +2763,6 @@ static int reiserfs_write_begin(struct file *file,
int old_ref = 0;
inode = mapping->host;
- *fsdata = NULL;
- if (flags & AOP_FLAG_CONT_EXPAND &&
- (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
- pos ++;
- *fsdata = (void *)(unsigned long)flags;
- }
-
index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -2896,9 +2889,6 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
unsigned start;
bool locked = false;
- if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
- pos ++;
-
reiserfs_wait_on_write_block(inode->i_sb);
if (reiserfs_transaction_running(inode->i_sb))
th = current->journal_info;
@@ -3094,7 +3084,7 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
* decide if this buffer needs to stay around for data logging or ordered
* write purposes
*/
-static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh)
{
int ret = 1;
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
@@ -3147,26 +3137,26 @@ free_jh:
return ret;
}
-/* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+/* clm -- taken from fs/buffer.c:block_invalidate_folio */
+static void reiserfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct buffer_head *head, *bh, *next;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned int curr_off = 0;
unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_SIZE);
+ int partial_page = (offset || length < folio_size(folio));
int ret = 1;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
if (!partial_page)
- ClearPageChecked(page);
+ folio_clear_checked(folio);
- if (!page_has_buffers(page))
+ head = folio_buffers(folio);
+ if (!head)
goto out;
- head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
@@ -3179,7 +3169,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
* is this block fully invalidated?
*/
if (offset <= curr_off) {
- if (invalidatepage_can_drop(inode, bh))
+ if (invalidate_folio_can_drop(inode, bh))
reiserfs_unmap_buffer(bh);
else
ret = 0;
@@ -3194,21 +3184,21 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
* so real IO is not possible anymore.
*/
if (!partial_page && ret) {
- ret = try_to_release_page(page, 0);
+ ret = filemap_release_folio(folio, 0);
/* maybe should BUG_ON(!ret); - neilb */
}
out:
return;
}
-static int reiserfs_set_page_dirty(struct page *page)
+static bool reiserfs_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- if (reiserfs_file_data_log(inode)) {
- SetPageChecked(page);
- return __set_page_dirty_nobuffers(page);
+ if (reiserfs_file_data_log(mapping->host)) {
+ folio_set_checked(folio);
+ return filemap_dirty_folio(mapping, folio);
}
- return __set_page_dirty_buffers(page);
+ return block_dirty_folio(mapping, folio);
}
/*
@@ -3316,7 +3306,11 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
/* fill in hole pointers in the expanding truncate case. */
if (attr->ia_size > inode->i_size) {
- error = generic_cont_expand_simple(inode, attr->ia_size);
+ loff_t pos = attr->ia_size;
+
+ if ((pos & (inode->i_sb->s_blocksize - 1)) == 0)
+ pos++;
+ error = generic_cont_expand_simple(inode, pos);
if (REISERFS_I(inode)->i_prealloc_count > 0) {
int err;
struct reiserfs_transaction_handle th;
@@ -3430,10 +3424,10 @@ const struct address_space_operations reiserfs_address_space_operations = {
.readpage = reiserfs_readpage,
.readahead = reiserfs_readahead,
.releasepage = reiserfs_releasepage,
- .invalidatepage = reiserfs_invalidatepage,
+ .invalidate_folio = reiserfs_invalidate_folio,
.write_begin = reiserfs_write_begin,
.write_end = reiserfs_write_end,
.bmap = reiserfs_aop_bmap,
.direct_IO = reiserfs_direct_IO,
- .set_page_dirty = reiserfs_set_page_dirty,
+ .dirty_folio = reiserfs_dirty_folio,
};
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 0834b101c316..b5b6f6201bed 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -858,8 +858,8 @@ loop_next:
ret = -EIO;
}
/*
- * ugly interaction with invalidatepage here.
- * reiserfs_invalidate_page will pin any buffer that has a
+ * ugly interaction with invalidate_folio here.
+ * reiserfs_invalidate_folio will pin any buffer that has a
* valid journal head from an older transaction. If someone
* else sets our buffer dirty after we write it in the first
* loop, and then someone truncates the page away, nobody
@@ -951,7 +951,9 @@ static int reiserfs_async_progress_wait(struct super_block *s)
int depth;
depth = reiserfs_write_unlock_nested(s);
- congestion_wait(BLK_RW_ASYNC, HZ / 10);
+ wait_var_event_timeout(&j->j_async_throttle,
+ atomic_read(&j->j_async_throttle) == 0,
+ HZ / 10);
reiserfs_write_lock_nested(s, depth);
}
@@ -1058,7 +1060,8 @@ static int flush_commit_list(struct super_block *s,
put_bh(tbh) ;
}
}
- atomic_dec(&journal->j_async_throttle);
+ if (atomic_dec_and_test(&journal->j_async_throttle))
+ wake_up_var(&journal->j_async_throttle);
for (i = 0; i < (jl->j_len + 1); i++) {
bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 58481f8d63d5..cfb7c44c7366 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -639,7 +639,7 @@ static struct kmem_cache *reiserfs_inode_cachep;
static struct inode *reiserfs_alloc_inode(struct super_block *sb)
{
struct reiserfs_inode_info *ei;
- ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
atomic_set(&ei->openers, 0);
@@ -1199,9 +1199,7 @@ static int reiserfs_parse_options(struct super_block *s,
if (!strcmp(arg, "auto")) {
/* From JFS code, to auto-get the size. */
- *blocks =
- i_size_read(s->s_bdev->bd_inode) >> s->
- s_blocksize_bits;
+ *blocks = sb_bdev_nr_blocks(s);
} else {
*blocks = simple_strtoul(arg, &p, 0);
if (*p != '\0') {
@@ -1437,7 +1435,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
unsigned long safe_mask = 0;
unsigned int commit_max_age = (unsigned int)-1;
struct reiserfs_journal *journal = SB_JOURNAL(s);
- char *new_opts;
int err;
char *qf_names[REISERFS_MAXQUOTAS];
unsigned int qfmt = 0;
@@ -1445,10 +1442,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
int i;
#endif
- new_opts = kstrdup(arg, GFP_KERNEL);
- if (arg && !new_opts)
- return -ENOMEM;
-
sync_filesystem(s);
reiserfs_write_lock(s);
@@ -1599,7 +1592,6 @@ out_ok_unlocked:
out_err_unlock:
reiserfs_write_unlock(s);
out_err:
- kfree(new_opts);
return err;
}
@@ -1660,6 +1652,8 @@ static int read_super_block(struct super_block *s, int offset)
return 1;
}
+ reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and "
+ "scheduled to be removed from the kernel in 2025");
SB_BUFFER_WITH_SB(s) = bh;
SB_DISK_SUPER_BLOCK(s) = rs;
@@ -1986,9 +1980,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
* smaller than the filesystem. If the check fails then abort and
* scream, because bad stuff will happen otherwise.
*/
- if (s->s_bdev && s->s_bdev->bd_inode
- && i_size_read(s->s_bdev->bd_inode) <
- sb_block_count(rs) * sb_blocksize(rs)) {
+ if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
SWARN(silent, s, "", "Filesystem cannot be "
"mounted because it is bigger than the device");
SWARN(silent, s, "", "You may need to run fsck "
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 6d4a9beaa097..e112b5424cdb 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -146,119 +146,113 @@ static int generic_remap_check_len(struct inode *inode_in,
}
/* Read a page's worth of file data into the page cache. */
-static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos)
{
- struct page *page;
+ struct folio *folio;
- page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
- if (IS_ERR(page))
- return page;
- if (!PageUptodate(page)) {
- put_page(page);
+ folio = read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
+ if (IS_ERR(folio))
+ return folio;
+ if (!folio_test_uptodate(folio)) {
+ folio_put(folio);
return ERR_PTR(-EIO);
}
- return page;
+ return folio;
}
/*
- * Lock two pages, ensuring that we lock in offset order if the pages are from
- * the same file.
+ * Lock two folios, ensuring that we lock in offset order if the folios
+ * are from the same file.
*/
-static void vfs_lock_two_pages(struct page *page1, struct page *page2)
+static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2)
{
/* Always lock in order of increasing index. */
- if (page1->index > page2->index)
- swap(page1, page2);
+ if (folio1->index > folio2->index)
+ swap(folio1, folio2);
- lock_page(page1);
- if (page1 != page2)
- lock_page(page2);
+ folio_lock(folio1);
+ if (folio1 != folio2)
+ folio_lock(folio2);
}
-/* Unlock two pages, being careful not to unlock the same page twice. */
-static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
+/* Unlock two folios, being careful not to unlock the same folio twice. */
+static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
{
- unlock_page(page1);
- if (page1 != page2)
- unlock_page(page2);
+ folio_unlock(folio1);
+ if (folio1 != folio2)
+ folio_unlock(folio2);
}
/*
* Compare extents of two files to see if they are the same.
* Caller must have locked both inodes to prevent write races.
*/
-static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
+static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
+ struct file *dest, loff_t dstoff,
loff_t len, bool *is_same)
{
- loff_t src_poff;
- loff_t dest_poff;
- void *src_addr;
- void *dest_addr;
- struct page *src_page;
- struct page *dest_page;
- loff_t cmp_len;
- bool same;
- int error;
-
- error = -EINVAL;
- same = true;
+ bool same = true;
+ int error = -EINVAL;
+
while (len) {
- src_poff = srcoff & (PAGE_SIZE - 1);
- dest_poff = destoff & (PAGE_SIZE - 1);
- cmp_len = min(PAGE_SIZE - src_poff,
- PAGE_SIZE - dest_poff);
+ struct folio *src_folio, *dst_folio;
+ void *src_addr, *dst_addr;
+ loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff),
+ PAGE_SIZE - offset_in_page(dstoff));
+
cmp_len = min(cmp_len, len);
if (cmp_len <= 0)
goto out_error;
- src_page = vfs_dedupe_get_page(src, srcoff);
- if (IS_ERR(src_page)) {
- error = PTR_ERR(src_page);
+ src_folio = vfs_dedupe_get_folio(src, srcoff);
+ if (IS_ERR(src_folio)) {
+ error = PTR_ERR(src_folio);
goto out_error;
}
- dest_page = vfs_dedupe_get_page(dest, destoff);
- if (IS_ERR(dest_page)) {
- error = PTR_ERR(dest_page);
- put_page(src_page);
+ dst_folio = vfs_dedupe_get_folio(dest, dstoff);
+ if (IS_ERR(dst_folio)) {
+ error = PTR_ERR(dst_folio);
+ folio_put(src_folio);
goto out_error;
}
- vfs_lock_two_pages(src_page, dest_page);
+ vfs_lock_two_folios(src_folio, dst_folio);
/*
- * Now that we've locked both pages, make sure they're still
+ * Now that we've locked both folios, make sure they're still
* mapped to the file data we're interested in. If not,
* someone is invalidating pages on us and we lose.
*/
- if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
- src_page->mapping != src->i_mapping ||
- dest_page->mapping != dest->i_mapping) {
+ if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) ||
+ src_folio->mapping != src->f_mapping ||
+ dst_folio->mapping != dest->f_mapping) {
same = false;
goto unlock;
}
- src_addr = kmap_atomic(src_page);
- dest_addr = kmap_atomic(dest_page);
+ src_addr = kmap_local_folio(src_folio,
+ offset_in_folio(src_folio, srcoff));
+ dst_addr = kmap_local_folio(dst_folio,
+ offset_in_folio(dst_folio, dstoff));
- flush_dcache_page(src_page);
- flush_dcache_page(dest_page);
+ flush_dcache_folio(src_folio);
+ flush_dcache_folio(dst_folio);
- if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+ if (memcmp(src_addr, dst_addr, cmp_len))
same = false;
- kunmap_atomic(dest_addr);
- kunmap_atomic(src_addr);
+ kunmap_local(dst_addr);
+ kunmap_local(src_addr);
unlock:
- vfs_unlock_two_pages(src_page, dest_page);
- put_page(dest_page);
- put_page(src_page);
+ vfs_unlock_two_folios(src_folio, dst_folio);
+ folio_put(dst_folio);
+ folio_put(src_folio);
if (!same)
break;
srcoff += cmp_len;
- destoff += cmp_len;
+ dstoff += cmp_len;
len -= cmp_len;
}
@@ -339,8 +333,8 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
+ ret = vfs_dedupe_file_range_compare(file_in, pos_in,
+ file_out, pos_out, *len, &is_same);
if (ret)
return ret;
if (!is_same)
@@ -368,11 +362,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
- /*
- * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
- * the same mount. Practically, they only need to be on the same file
- * system.
- */
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
@@ -464,7 +453,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
goto out_drop_write;
ret = -EXDEV;
- if (src_file->f_path.mnt != dst_file->f_path.mnt)
+ if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb)
goto out_drop_write;
ret = -EISDIR;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 259f684d9236..9e6bbb4219de 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -375,7 +375,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
{
struct romfs_inode_info *inode;
- inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
+ inode = alloc_inode_sb(sb, romfs_inode_cachep, GFP_KERNEL);
return inode ? &inode->vfs_inode : NULL;
}
diff --git a/fs/select.c b/fs/select.c
index 945896d0ac9e..0ee55af1a55c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,6 +15,7 @@
* of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
*/
+#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
@@ -458,9 +459,11 @@ get_max:
return max;
}
-#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR)
-#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR)
-#define POLLEX_SET (EPOLLPRI)
+#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\
+ EPOLLNVAL)
+#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\
+ EPOLLNVAL)
+#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)
static inline void wait_key_set(poll_table *wait, unsigned long in,
unsigned long out, unsigned long bit,
@@ -527,6 +530,7 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
break;
if (!(bit & all_bits))
continue;
+ mask = EPOLLNVAL;
f = fdget(i);
if (f.file) {
wait_key_set(wait, in, out, bit,
@@ -534,34 +538,34 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
mask = vfs_poll(f.file, wait);
fdput(f);
- if ((mask & POLLIN_SET) && (in & bit)) {
- res_in |= bit;
- retval++;
- wait->_qproc = NULL;
- }
- if ((mask & POLLOUT_SET) && (out & bit)) {
- res_out |= bit;
- retval++;
- wait->_qproc = NULL;
- }
- if ((mask & POLLEX_SET) && (ex & bit)) {
- res_ex |= bit;
- retval++;
- wait->_qproc = NULL;
- }
- /* got something, stop busy polling */
- if (retval) {
- can_busy_loop = false;
- busy_flag = 0;
-
- /*
- * only remember a returned
- * POLL_BUSY_LOOP if we asked for it
- */
- } else if (busy_flag & mask)
- can_busy_loop = true;
-
}
+ if ((mask & POLLIN_SET) && (in & bit)) {
+ res_in |= bit;
+ retval++;
+ wait->_qproc = NULL;
+ }
+ if ((mask & POLLOUT_SET) && (out & bit)) {
+ res_out |= bit;
+ retval++;
+ wait->_qproc = NULL;
+ }
+ if ((mask & POLLEX_SET) && (ex & bit)) {
+ res_ex |= bit;
+ retval++;
+ wait->_qproc = NULL;
+ }
+ /* got something, stop busy polling */
+ if (retval) {
+ can_busy_loop = false;
+ busy_flag = 0;
+
+ /*
+ * only remember a returned
+ * POLL_BUSY_LOOP if we asked for it
+ */
+ } else if (busy_flag & mask)
+ can_busy_loop = true;
+
}
if (res_in)
*rinp = res_in;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4a2cda04d3e2..7ab8a58c29b6 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -383,22 +383,6 @@ void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
}
EXPORT_SYMBOL(seq_escape_mem);
-/**
- * seq_escape - print string into buffer, escaping some characters
- * @m: target buffer
- * @s: string
- * @esc: set of characters that need escaping
- *
- * Puts string into buffer, replacing each occurrence of character from
- * @esc with usual octal escape.
- * Use seq_has_overflowed() to check for errors.
- */
-void seq_escape(struct seq_file *m, const char *s, const char *esc)
-{
- seq_escape_str(m, s, ESCAPE_OCTAL, esc);
-}
-EXPORT_SYMBOL(seq_escape);
-
void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
int len;
@@ -570,9 +554,9 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
}
EXPORT_SYMBOL(seq_dentry);
-static void *single_start(struct seq_file *p, loff_t *pos)
+void *single_start(struct seq_file *p, loff_t *pos)
{
- return NULL + (*pos == 0);
+ return *pos ? NULL : SEQ_START_TOKEN;
}
static void *single_next(struct seq_file *p, void *v, loff_t *pos)
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 040e1cf90528..e20d1484c663 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -35,17 +35,7 @@
void signalfd_cleanup(struct sighand_struct *sighand)
{
- wait_queue_head_t *wqh = &sighand->signalfd_wqh;
- /*
- * The lockless check can race with remove_wait_queue() in progress,
- * but in this case its caller should run under rcu_read_lock() and
- * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
- */
- if (likely(!waitqueue_active(wqh)))
- return;
-
- /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
- wake_up_poll(wqh, EPOLLHUP | POLLFREE);
+ wake_up_pollfree(&sighand->signalfd_wqh);
}
struct signalfd_ctx {
@@ -165,11 +155,12 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info,
int nonblock)
{
+ enum pid_type type;
ssize_t ret;
DECLARE_WAITQUEUE(wait, current);
spin_lock_irq(&current->sighand->siglock);
- ret = dequeue_signal(current, &ctx->sigmask, info);
+ ret = dequeue_signal(current, &ctx->sigmask, info, &type);
switch (ret) {
case 0:
if (!nonblock)
@@ -184,7 +175,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
add_wait_queue(&current->sighand->signalfd_wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- ret = dequeue_signal(current, &ctx->sigmask, info);
+ ret = dequeue_signal(current, &ctx->sigmask, info, &type);
if (ret != 0)
break;
if (signal_pending(current)) {
diff --git a/fs/smbfs_common/cifs_arc4.c b/fs/smbfs_common/cifs_arc4.c
index 85ba15a60b13..043e4cb839fa 100644
--- a/fs/smbfs_common/cifs_arc4.c
+++ b/fs/smbfs_common/cifs_arc4.c
@@ -72,16 +72,3 @@ void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int l
ctx->y = y;
}
EXPORT_SYMBOL_GPL(cifs_arc4_crypt);
-
-static int __init
-init_smbfs_common(void)
-{
- return 0;
-}
-static void __init
-exit_smbfs_common(void)
-{
-}
-
-module_init(init_smbfs_common)
-module_exit(exit_smbfs_common)
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
new file mode 100644
index 000000000000..0507aecfc669
--- /dev/null
+++ b/fs/smbfs_common/smb2pdu.h
@@ -0,0 +1,1604 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+#ifndef _COMMON_SMB2PDU_H
+#define _COMMON_SMB2PDU_H
+
+/*
+ * Note that, due to trying to use names similar to the protocol specifications,
+ * there are many mixed case field names in the structures below. Although
+ * this does not match typical Linux kernel style, it is necessary to be
+ * able to match against the protocol specification.
+ *
+ * SMB2 commands
+ * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+ * (ie no useful data other than the SMB error code itself) and are marked such.
+ * Knowing this helps avoid response buffer allocations and copy in some cases.
+ */
+
+/* List of commands in host endian */
+#define SMB2_NEGOTIATE_HE 0x0000
+#define SMB2_SESSION_SETUP_HE 0x0001
+#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */
+#define SMB2_TREE_CONNECT_HE 0x0003
+#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */
+#define SMB2_CREATE_HE 0x0005
+#define SMB2_CLOSE_HE 0x0006
+#define SMB2_FLUSH_HE 0x0007 /* trivial resp */
+#define SMB2_READ_HE 0x0008
+#define SMB2_WRITE_HE 0x0009
+#define SMB2_LOCK_HE 0x000A
+#define SMB2_IOCTL_HE 0x000B
+#define SMB2_CANCEL_HE 0x000C
+#define SMB2_ECHO_HE 0x000D
+#define SMB2_QUERY_DIRECTORY_HE 0x000E
+#define SMB2_CHANGE_NOTIFY_HE 0x000F
+#define SMB2_QUERY_INFO_HE 0x0010
+#define SMB2_SET_INFO_HE 0x0011
+#define SMB2_OPLOCK_BREAK_HE 0x0012
+
+/* The same list in little endian */
+#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
+#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE)
+#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE)
+#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE)
+#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
+#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE)
+#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE)
+#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE)
+#define SMB2_READ cpu_to_le16(SMB2_READ_HE)
+#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE)
+#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE)
+#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE)
+#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE)
+#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE)
+#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
+#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
+#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE)
+#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
+#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
+
+#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF)
+
+#define NUMBER_OF_SMB2_COMMANDS 0x0013
+
+/*
+ * Size of the session key (crypto key encrypted with the password)
+ */
+#define SMB2_NTLMV2_SESSKEY_SIZE 16
+#define SMB2_SIGNATURE_SIZE 16
+#define SMB2_HMACSHA256_SIZE 32
+#define SMB2_CMACAES_SIZE 16
+#define SMB3_GCM128_CRYPTKEY_SIZE 16
+#define SMB3_GCM256_CRYPTKEY_SIZE 32
+
+/*
+ * Size of the smb3 encryption/decryption keys
+ * This size is big enough to store any cipher key types.
+ */
+#define SMB3_ENC_DEC_KEY_SIZE 32
+
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE 16
+
+#define CIFS_CLIENT_CHALLENGE_SIZE 8
+
+/* Maximum buffer size value we can send with 1 credit */
+#define SMB2_MAX_BUFFER_SIZE 65536
+
+/*
+ * The default wsize is 1M for SMB2 (and for some CIFS cases) -- NOTE(review):
+ * the macro below is 4M, not 1M; confirm. find_get_pages seems to return a
+ * maximum of 256 pages in a single call. With PAGE_SIZE == 4k, this means we
+ * can fill a single 1M wsize request with a single call.
+ */
+#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
+
+/*
+ * SMB2 Header Definition
+ *
+ * "MBZ" : Must be Zero
+ * "BB" : BugBug, Something to check/review/analyze later
+ * "PDU" : "Protocol Data Unit" (ie a network "frame")
+ *
+ */
+
+#define __SMB2_HEADER_STRUCTURE_SIZE 64
+#define SMB2_HEADER_STRUCTURE_SIZE \
+ cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE)
+
+#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
+#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
+#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
+
+/*
+ * SMB2 flag definitions
+ */
+#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
+#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
+#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
+#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */
+#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
+#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */
+
+/*
+ * Definitions for SMB2 Protocol Data Units (network frames)
+ *
+ * See MS-SMB2.PDF specification for protocol details.
+ * The Naming convention is the lower case version of the SMB2
+ * command code name for the struct. Note that structures must be packed.
+ *
+ */
+
+/* See MS-SMB2 section 2.2.1 */
+struct smb2_hdr {
+ __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
+ __le16 StructureSize; /* 64 */
+ __le16 CreditCharge; /* MBZ */
+ __le32 Status; /* Error from server */
+ __le16 Command;
+ __le16 CreditRequest; /* CreditResponse */
+ __le32 Flags;
+ __le32 NextCommand;
+ __le64 MessageId;
+ union {
+ struct {
+ __le32 ProcessId;
+ __le32 TreeId;
+ } __packed SyncId;
+ __le64 AsyncId;
+ } __packed Id;
+ __le64 SessionId;
+ __u8 Signature[16];
+} __packed;
+
+struct smb2_pdu {
+ struct smb2_hdr hdr;
+ __le16 StructureSize2; /* size of wct area (varies, request specific) */
+} __packed;
+
+#define SMB2_ERROR_STRUCTURE_SIZE2 9
+#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2)
+
+struct smb2_err_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __u8 ErrorContextCount;
+ __u8 Reserved;
+ __le32 ByteCount; /* even if zero, at least one byte follows */
+ __u8 ErrorData[1]; /* variable length */
+} __packed;
+
+#define SMB3_AES_CCM_NONCE 11
+#define SMB3_AES_GCM_NONCE 12
+
+/* Transform flags (for 3.0 dialect this flag indicates CCM) */
+#define TRANSFORM_FLAG_ENCRYPTED 0x0001
+struct smb2_transform_hdr {
+ __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
+ __u8 Signature[16];
+ __u8 Nonce[16];
+ __le32 OriginalMessageSize;
+ __u16 Reserved1;
+ __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
+ __le64 SessionId;
+} __packed;
+
+
+/* See MS-SMB2 2.2.42 */
+struct smb2_compression_transform_hdr_unchained {
+ __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
+ __le32 OriginalCompressedSegmentSize;
+ __le16 CompressionAlgorithm;
+ __le16 Flags;
+ __le16 Length; /* if chained it is length, else offset */
+} __packed;
+
+/* See MS-SMB2 2.2.42.1 */
+#define SMB2_COMPRESSION_FLAG_NONE 0x0000
+#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001
+
+struct compression_payload_header {
+ __le16 CompressionAlgorithm;
+ __le16 Flags;
+ __le32 Length; /* length of compressed payload including field below if present */
+ /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2 */
+struct smb2_compression_transform_hdr_chained {
+ __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
+ __le32 OriginalCompressedSegmentSize;
+ /* struct compression_payload_header[] */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2.2 */
+struct compression_pattern_payload_v1 {
+ __le16 Pattern;
+ __le16 Reserved1;
+ __le16 Reserved2;
+ __le32 Repetitions;
+} __packed;
+
+/* See MS-SMB2 section 2.2.9.2 */
+/* Context Types */
+#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000
+#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001)
+
+struct tree_connect_contexts {
+ __le16 ContextType;
+ __le16 DataLength;
+ __le32 Reserved;
+ __u8 Data[];
+} __packed;
+
+/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
+struct smb3_blob_data {
+ __le16 BlobSize;
+ __u8 BlobData[];
+} __packed;
+
+/* Valid values for Attr */
+#define SE_GROUP_MANDATORY 0x00000001
+#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002
+#define SE_GROUP_ENABLED 0x00000004
+#define SE_GROUP_OWNER 0x00000008
+#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010
+#define SE_GROUP_INTEGRITY 0x00000020
+#define SE_GROUP_INTEGRITY_ENABLED 0x00000040
+#define SE_GROUP_RESOURCE 0x20000000
+#define SE_GROUP_LOGON_ID 0xC0000000
+
+/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
+
+struct sid_array_data {
+ __le16 SidAttrCount;
+ /* SidAttrList - array of sid_attr_data structs */
+} __packed;
+
+struct luid_attr_data {
+
+} __packed;
+
+/*
+ * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
+ * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR_DATA
+ */
+
+struct privilege_array_data {
+ __le16 PrivilegeCount;
+ /* array of privilege_data structs */
+} __packed;
+
+struct remoted_identity_tcon_context {
+ __le16 TicketType; /* must be 0x0001 */
+ __le16 TicketSize; /* total size of this struct */
+ __le16 User; /* offset to SID_ATTR_DATA struct with user info */
+ __le16 UserName; /* offset to null terminated Unicode username string */
+ __le16 Domain; /* offset to null terminated Unicode domain name */
+ __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */
+ __le16 RestrictedGroups; /* similar to above */
+ __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */
+ __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */
+ __le16 Owner; /* offset to BLOB_DATA struct */
+ __le16 DefaultDacl; /* offset to BLOB_DATA struct */
+ __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
+ __le16 UserClaims; /* offset to BLOB_DATA struct */
+ __le16 DeviceClaims; /* offset to BLOB_DATA struct */
+ __u8 TicketInfo[]; /* variable length buf - remoted identity data */
+} __packed;
+
+struct smb2_tree_connect_req_extension {
+ __le32 TreeConnectContextOffset;
+ __le16 TreeConnectContextCount;
+ __u8 Reserved[10];
+ __u8 PathName[]; /* variable sized array */
+ /* followed by array of TreeConnectContexts */
+} __packed;
+
+/* Flags/Reserved for SMB3.1.1 */
+#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001)
+#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002)
+#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004)
+
+struct smb2_tree_connect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 Flags; /* Flags in SMB3.1.1 */
+ __le16 PathOffset;
+ __le16 PathLength;
+ __u8 Buffer[1]; /* variable length */
+} __packed;
+
+/* Possible ShareType values */
+#define SMB2_SHARE_TYPE_DISK 0x01
+#define SMB2_SHARE_TYPE_PIPE 0x02
+#define SMB2_SHARE_TYPE_PRINT 0x03
+
+/*
+ * Possible ShareFlags - exactly one of the first 4 caching flags must be
+ * set (any of the remaining, SHI1005, flags may be set individually
+ * or in combination).
+ */
+#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000
+#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010
+#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020
+#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
+#define SHI1005_FLAGS_DFS 0x00000001
+#define SHI1005_FLAGS_DFS_ROOT 0x00000002
+#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
+#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
+#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
+#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
+#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
+#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
+#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */
+#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */
+#define SHI1005_FLAGS_ALL 0x0014FF33
+
+/* Possible share capabilities */
+#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
+#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */
+
+struct smb2_tree_connect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 16 */
+ __u8 ShareType; /* see SMB2_SHARE_TYPE_* above */
+ __u8 Reserved;
+ __le32 ShareFlags; /* see SMB2_SHAREFLAG_* and SHI1005_FLAGS_* above */
+ __le32 Capabilities; /* see SMB2_SHARE_CAP_* above */
+ __le32 MaximalAccess;
+} __packed;
+
+struct smb2_tree_disconnect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_tree_disconnect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_NEGOTIATE_PROTOCOL See MS-SMB2 section 2.2.3
+ */
+/* SecurityMode flags */
+#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001
+#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE cpu_to_le16(0x0001)
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002)
+#define SMB2_SEC_MODE_FLAGS_ALL 0x0003
+
+/* Capabilities flags */
+#define SMB2_GLOBAL_CAP_DFS 0x00000001
+#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
+/* Internal types */
+#define SMB2_NT_FIND 0x00100000
+#define SMB2_LARGE_FILES 0x00200000
+
+#define SMB2_CLIENT_GUID_SIZE 16
+#define SMB2_CREATE_GUID_SIZE 16
+
+/* Dialects */
+#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */
+#define SMB20_PROT_ID 0x0202
+#define SMB21_PROT_ID 0x0210
+#define SMB2X_PROT_ID 0x02FF
+#define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
+#define SMB311_PROT_ID 0x0311
+#define BAD_PROT_ID 0xFFFF
+
+#define SMB311_SALT_SIZE 32
+/* Hash Algorithm Types */
+#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
+#define SMB2_PREAUTH_HASH_SIZE 64
+
+/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */
+#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
+#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
+#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3)
+#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5)
+#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6)
+#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7)
+#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8)
+#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
+
+struct smb2_neg_context {
+ __le16 ContextType;
+ __le16 DataLength;
+ __le32 Reserved;
+ /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */
+} __packed;
+
+/*
+ * The SaltLength the server sends can be zero, so the only three required
+ * fields (all __le16) end up six bytes total, so the minimum context data len
+ * in the response is six bytes which accounts for
+ *
+ * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm.
+ */
+#define MIN_PREAUTH_CTXT_DATA_LEN 6
+
+struct smb2_preauth_neg_context {
+ __le16 ContextType; /* 1 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 HashAlgorithmCount; /* 1 */
+ __le16 SaltLength;
+ __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
+ __u8 Salt[SMB311_SALT_SIZE];
+} __packed;
+
+/* Encryption Algorithms Ciphers */
+#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
+#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003)
+#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004)
+
+/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
+#define MIN_ENCRYPT_CTXT_DATA_LEN 4
+struct smb2_encryption_neg_context {
+ __le16 ContextType; /* 2 */
+ __le16 DataLength;
+ __le32 Reserved;
+ /* CipherCount usually 2, but can be 3 when AES256-GCM enabled */
+ __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */
+ __le16 Ciphers[];
+} __packed;
+
+/* See MS-SMB2 2.2.3.1.3 */
+#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000)
+#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
+#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
+#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
+/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
+#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */
+
+/* Compression Flags */
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000)
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001)
+
+struct smb2_compression_capabilities_context {
+ __le16 ContextType; /* 3 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 CompressionAlgorithmCount;
+ __le16 Padding;
+ __le32 Flags;
+ __le16 CompressionAlgorithms[3];
+ __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */
+ /* Check if pad needed */
+} __packed;
+
+/*
+ * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
+ * Its struct simply contains NetName, an array of Unicode characters
+ */
+struct smb2_netname_neg_context {
+ __le16 ContextType; /* 5 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 NetName[]; /* hostname of target converted to UCS-2 */
+} __packed;
+
+/*
+ * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5
+ * and 2.2.4.1.5
+ */
+
+/* Flags */
+#define SMB2_ACCEPT_TRANSPORT_LEVEL_SECURITY 0x00000001
+
+struct smb2_transport_capabilities_context {
+ __le16 ContextType; /* 6 */
+ __le16 DataLength;
+ __u32 Reserved;
+ __le32 Flags;
+ __u32 Pad;
+} __packed;
+
+/*
+ * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
+ * and 2.2.4.1.6
+ */
+
+/* RDMA Transform IDs */
+#define SMB2_RDMA_TRANSFORM_NONE 0x0000
+#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
+#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002
+
+struct smb2_rdma_transform_capabilities_context {
+ __le16 ContextType; /* 7 */
+ __le16 DataLength;
+ __u32 Reserved;
+ __le16 TransformCount;
+ __u16 Reserved1;
+ __u32 Reserved2;
+ __le16 RDMATransformIds[];
+} __packed;
+
+/*
+ * For signing capabilities context see MS-SMB2 2.2.3.1.7
+ * and 2.2.4.1.7
+ */
+
+/* Signing algorithms */
+#define SIGNING_ALG_HMAC_SHA256 0
+#define SIGNING_ALG_HMAC_SHA256_LE cpu_to_le16(0)
+#define SIGNING_ALG_AES_CMAC 1
+#define SIGNING_ALG_AES_CMAC_LE cpu_to_le16(1)
+#define SIGNING_ALG_AES_GMAC 2
+#define SIGNING_ALG_AES_GMAC_LE cpu_to_le16(2)
+
+struct smb2_signing_capabilities {
+ __le16 ContextType; /* 8 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 SigningAlgorithmCount;
+ __le16 SigningAlgorithms[];
+ /* Followed by padding to 8 byte boundary (required by some servers) */
+} __packed;
+
+#define POSIX_CTXT_DATA_LEN 16
+struct smb2_posix_neg_context {
+ __le16 ContextType; /* 0x100 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
+} __packed;
+
+struct smb2_negotiate_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 36 */
+ __le16 DialectCount;
+ __le16 SecurityMode;
+ __le16 Reserved; /* MBZ */
+ __le32 Capabilities;
+ __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
+ /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
+ __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
+ __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
+ __le16 Reserved2;
+ __le16 Dialects[];
+} __packed;
+
+struct smb2_negotiate_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 65 */
+ __le16 SecurityMode;
+ __le16 DialectRevision;
+ __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */
+ __u8 ServerGUID[16];
+ __le32 Capabilities;
+ __le32 MaxTransactSize;
+ __le32 MaxReadSize;
+ __le32 MaxWriteSize;
+ __le64 SystemTime; /* MBZ */
+ __le64 ServerStartTime;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+
+/*
+ * SMB2_SESSION_SETUP See MS-SMB2 section 2.2.5
+ */
+/* Flags */
+#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
+#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
+
+struct smb2_sess_setup_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 25 */
+ __u8 Flags;
+ __u8 SecurityMode;
+ __le32 Capabilities;
+ __le32 Channel;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le64 PreviousSessionId;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+/* Currently defined SessionFlags */
+#define SMB2_SESSION_FLAG_IS_GUEST 0x0001
+#define SMB2_SESSION_FLAG_IS_GUEST_LE cpu_to_le16(0x0001)
+#define SMB2_SESSION_FLAG_IS_NULL 0x0002
+#define SMB2_SESSION_FLAG_IS_NULL_LE cpu_to_le16(0x0002)
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004)
+
+struct smb2_sess_setup_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 SessionFlags;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+
+/*
+ * SMB2_LOGOFF See MS-SMB2 section 2.2.7
+ */
+struct smb2_logoff_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_logoff_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_CLOSE See MS-SMB2 section 2.2.15
+ */
+/* Currently defined values for close flags */
+#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
+struct smb2_close_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Flags;
+ __le32 Reserved;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+} __packed;
+
+/*
+ * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
+ */
+#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
+
+struct smb2_close_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* 60 */
+ __le16 Flags;
+ __le32 Reserved;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 EndOfFile;
+ __le32 Attributes;
+} __packed;
+
+
+/*
+ * SMB2_READ See MS-SMB2 section 2.2.19
+ */
+/* For read request Flags field below, following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED 0x01
+#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */
+
+/* Channel field for read and write: exactly one of the following flags can be set */
+#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
+#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001)
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002)
+#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003)
+
+/* SMB2 read request without RFC1001 length at the beginning */
+struct smb2_read_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __u8 Padding; /* offset from start of SMB2 header to place read */
+ __u8 Flags; /* MBZ unless SMB3.02 or later */
+ __le32 Length;
+ __le64 Offset;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __le32 MinimumCount;
+ __le32 Channel; /* MBZ except for SMB3 or later */
+ __le32 RemainingBytes;
+ __le16 ReadChannelInfoOffset;
+ __le16 ReadChannelInfoLength;
+ __u8 Buffer[1];
+} __packed;
+
+/* Read flags */
+#define SMB2_READFLAG_RESPONSE_NONE cpu_to_le32(0x00000000)
+#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM cpu_to_le32(0x00000001)
+
+struct smb2_read_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __le32 Flags;
+ __u8 Buffer[1];
+} __packed;
+
+
+/*
+ * SMB2_WRITE See MS-SMB2 section 2.2.21
+ */
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
+
+struct smb2_write_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __le16 DataOffset; /* offset from start of SMB2 header to write data */
+ __le32 Length;
+ __le64 Offset;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 Channel; /* MBZ unless SMB3.02 or later */
+ __le32 RemainingBytes;
+ __le16 WriteChannelInfoOffset;
+ __le16 WriteChannelInfoLength;
+ __le32 Flags;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_write_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __u32 Reserved2;
+ __u8 Buffer[1];
+} __packed;
+
+
+/*
+ * SMB2_FLUSH See MS-SMB2 section 2.2.17
+ */
+struct smb2_flush_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Reserved1;
+ __le32 Reserved2;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+} __packed;
+
+struct smb2_flush_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __le16 Reserved;
+} __packed;
+
+#define SMB2_LOCKFLAG_SHARED 0x0001
+#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002
+#define SMB2_LOCKFLAG_UNLOCK 0x0004
+#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010
+#define SMB2_LOCKFLAG_MASK 0x0007
+
+struct smb2_lock_element {
+ __le64 Offset;
+ __le64 Length;
+ __le32 Flags;
+ __le32 Reserved;
+} __packed;
+
+struct smb2_lock_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 48 */
+ __le16 LockCount;
+ /*
+ * The least significant four bits are the index, the other 28 bits are
+ * the lock sequence number (0 to 64). See MS-SMB2 2.2.26
+ */
+ __le32 LockSequenceNumber;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ /* Followed by at least one */
+ struct smb2_lock_element locks[1];
+} __packed;
+
+struct smb2_lock_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_echo_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __u16 Reserved;
+} __packed;
+
+struct smb2_echo_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __u16 Reserved;
+} __packed;
+
+/*
+ * Valid FileInformation classes for query directory
+ *
+ * Note that these are a subset of the (file) QUERY_INFO levels defined
+ * later in this file (but since QUERY_DIRECTORY uses equivalent numbers
+ * we do not redefine them here)
+ *
+ * FileDirectoryInformation 0x01
+ * FileFullDirectoryInformation 0x02
+ * FileIdFullDirectoryInformation 0x26
+ * FileBothDirectoryInformation 0x03
+ * FileIdBothDirectoryInformation 0x25
+ * FileNamesInformation 0x0C
+ * FileIdExtdDirectoryInformation 0x3C
+ */
+
+/* search (query_directory) Flags field */
+#define SMB2_RESTART_SCANS 0x01
+#define SMB2_RETURN_SINGLE_ENTRY 0x02
+#define SMB2_INDEX_SPECIFIED 0x04
+#define SMB2_REOPEN 0x10
+
+struct smb2_query_directory_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 33 */
+ __u8 FileInformationClass;
+ __u8 Flags;
+ __le32 FileIndex;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __le16 FileNameOffset;
+ __le16 FileNameLength;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_query_directory_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+/*
+ * Maximum number of iovs we need for a set-info request.
+ * The largest one is rename/hardlink
+ * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info
+ * [1] : path
+ * [2] : compound padding
+ */
+#define SMB2_SET_INFO_IOV_SIZE 3
+
+struct smb2_set_info_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 33 */
+ __u8 InfoType;
+ __u8 FileInfoClass;
+ __le32 BufferLength;
+ __le16 BufferOffset;
+ __u16 Reserved;
+ __le32 AdditionalInformation;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_set_info_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 2 */
+} __packed;
+
+/*
+ * SMB2_NOTIFY See MS-SMB2 section 2.2.35
+ */
+/* notify flags */
+#define SMB2_WATCH_TREE 0x0001
+
+/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */
+#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001
+#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002
+#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004
+#define FILE_NOTIFY_CHANGE_SIZE 0x00000008
+#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010
+#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020
+#define FILE_NOTIFY_CHANGE_CREATION 0x00000040
+#define FILE_NOTIFY_CHANGE_EA 0x00000080
+#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100
+#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200
+#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400
+#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800
+
+/* SMB2 Notify Action Flags */
+#define FILE_ACTION_ADDED 0x00000001
+#define FILE_ACTION_REMOVED 0x00000002
+#define FILE_ACTION_MODIFIED 0x00000003
+#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004
+#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005
+#define FILE_ACTION_ADDED_STREAM 0x00000006
+#define FILE_ACTION_REMOVED_STREAM 0x00000007
+#define FILE_ACTION_MODIFIED_STREAM 0x00000008
+#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009
+
+struct smb2_change_notify_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __le16 Flags;
+ __le32 OutputBufferLength;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 CompletionFilter;
+ __u32 Reserved;
+} __packed;
+
+struct smb2_change_notify_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1]; /* array of file notify structs */
+} __packed;
+
+
+/*
+ * SMB2_CREATE See MS-SMB2 section 2.2.13
+ */
+/* Oplock levels */
+#define SMB2_OPLOCK_LEVEL_NONE 0x00
+#define SMB2_OPLOCK_LEVEL_II 0x01
+#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
+#define SMB2_OPLOCK_LEVEL_BATCH 0x09
+#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
+/* Non-spec internal type */
+#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
+
+/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
+#define IL_ANONYMOUS cpu_to_le32(0x00000000)
+#define IL_IDENTIFICATION cpu_to_le32(0x00000001)
+#define IL_IMPERSONATION cpu_to_le32(0x00000002)
+#define IL_DELEGATE cpu_to_le32(0x00000003)
+
+/* File Attributes */
+#define FILE_ATTRIBUTE_READONLY 0x00000001
+#define FILE_ATTRIBUTE_HIDDEN 0x00000002
+#define FILE_ATTRIBUTE_SYSTEM 0x00000004
+#define FILE_ATTRIBUTE_DIRECTORY 0x00000010
+#define FILE_ATTRIBUTE_ARCHIVE 0x00000020
+#define FILE_ATTRIBUTE_NORMAL 0x00000080
+#define FILE_ATTRIBUTE_TEMPORARY 0x00000100
+#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200
+#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400
+#define FILE_ATTRIBUTE_COMPRESSED 0x00000800
+#define FILE_ATTRIBUTE_OFFLINE 0x00001000
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000
+#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000
+#define FILE_ATTRIBUTE__MASK 0x00007FB7
+
+#define FILE_ATTRIBUTE_READONLY_LE cpu_to_le32(0x00000001)
+#define FILE_ATTRIBUTE_HIDDEN_LE cpu_to_le32(0x00000002)
+#define FILE_ATTRIBUTE_SYSTEM_LE cpu_to_le32(0x00000004)
+#define FILE_ATTRIBUTE_DIRECTORY_LE cpu_to_le32(0x00000010)
+#define FILE_ATTRIBUTE_ARCHIVE_LE cpu_to_le32(0x00000020)
+#define FILE_ATTRIBUTE_NORMAL_LE cpu_to_le32(0x00000080)
+#define FILE_ATTRIBUTE_TEMPORARY_LE cpu_to_le32(0x00000100)
+#define FILE_ATTRIBUTE_SPARSE_FILE_LE cpu_to_le32(0x00000200)
+#define FILE_ATTRIBUTE_REPARSE_POINT_LE cpu_to_le32(0x00000400)
+#define FILE_ATTRIBUTE_COMPRESSED_LE cpu_to_le32(0x00000800)
+#define FILE_ATTRIBUTE_OFFLINE_LE cpu_to_le32(0x00001000)
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED_LE cpu_to_le32(0x00002000)
+#define FILE_ATTRIBUTE_ENCRYPTED_LE cpu_to_le32(0x00004000)
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM_LE cpu_to_le32(0x00008000)
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000)
+#define FILE_ATTRIBUTE_MASK_LE cpu_to_le32(0x00007FB7)
+
+/* Desired Access Flags */
+#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
+#define FILE_LIST_DIRECTORY_LE cpu_to_le32(0x00000001)
+#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002)
+#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004)
+#define FILE_ADD_SUBDIRECTORY_LE cpu_to_le32(0x00000004)
+#define FILE_READ_EA_LE cpu_to_le32(0x00000008)
+#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010)
+#define FILE_EXECUTE_LE cpu_to_le32(0x00000020)
+#define FILE_DELETE_CHILD_LE cpu_to_le32(0x00000040)
+#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080)
+#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100)
+#define FILE_DELETE_LE cpu_to_le32(0x00010000)
+#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000)
+#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000)
+#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000)
+#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000)
+#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
+#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000)
+#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000)
+#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000)
+#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000)
+#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000)
+#define DESIRED_ACCESS_MASK cpu_to_le32(0xF21F01FF)
+
+
+#define FILE_READ_DESIRED_ACCESS_LE (FILE_READ_DATA_LE | \
+ FILE_READ_EA_LE | \
+ FILE_GENERIC_READ_LE)
+#define FILE_WRITE_DESIRE_ACCESS_LE (FILE_WRITE_DATA_LE | \
+ FILE_APPEND_DATA_LE | \
+ FILE_WRITE_EA_LE | \
+ FILE_WRITE_ATTRIBUTES_LE | \
+ FILE_GENERIC_WRITE_LE)
+
+/* ShareAccess Flags */
+#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001)
+#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002)
+#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004)
+#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007)
+
+/* CreateDisposition Flags */
+#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000)
+#define FILE_OPEN_LE cpu_to_le32(0x00000001)
+#define FILE_CREATE_LE cpu_to_le32(0x00000002)
+#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003)
+#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004)
+#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005)
+#define FILE_CREATE_MASK_LE cpu_to_le32(0x00000007)
+
+#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \
+ | FILE_READ_ATTRIBUTES)
+#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+ | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES)
+#define FILE_EXEC_RIGHTS (FILE_EXECUTE)
+
+/* CreateOptions Flags */
+#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001)
+/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */
+#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
+#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
+#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008)
+#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
+#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
+#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
+#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
+#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
+#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
+#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
+#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
+#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
+#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
+#define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF)
+
+#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
+ | FILE_READ_ATTRIBUTES_LE)
+#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
+ | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
+#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
+
+/* Create Context Values */
+#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */
+#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
+#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
+#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
+#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
+#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
+#define SMB2_CREATE_REQUEST_LEASE "RqLs"
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
+#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
+
+/* Flag (SMB3 open response) values */
+#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
+
+struct create_context {
+ __le32 Next;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le16 Reserved;
+ __le16 DataOffset;
+ __le32 DataLength;
+ __u8 Buffer[];
+} __packed;
+
+struct smb2_create_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __u8 SecurityFlags;
+ __u8 RequestedOplockLevel;
+ __le32 ImpersonationLevel;
+ __le64 SmbCreateFlags;
+ __le64 Reserved;
+ __le32 DesiredAccess;
+ __le32 FileAttributes;
+ __le32 ShareAccess;
+ __le32 CreateDisposition;
+ __le32 CreateOptions;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[];
+} __packed;
+
+struct smb2_create_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 89 */
+ __u8 OplockLevel;
+ __u8 Flags; /* 0x01 if reparse point */
+ __le32 CreateAction;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize;
+ __le64 EndofFile;
+ __le32 FileAttributes;
+ __le32 Reserved2;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[1];
+} __packed;
+
+struct create_posix {
+ struct create_context ccontext;
+ __u8 Name[16];
+ __le32 Mode;
+ __u32 Reserved;
+} __packed;
+
+#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00)
+#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01)
+#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02)
+#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04)
+
+#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02)
+
+#define SMB2_LEASE_KEY_SIZE 16
+
+struct lease_context {
+ __u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
+ __le32 LeaseState;
+ __le32 LeaseFlags;
+ __le64 LeaseDuration;
+} __packed;
+
+struct lease_context_v2 {
+ __u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
+ __le32 LeaseState;
+ __le32 LeaseFlags;
+ __le64 LeaseDuration;
+ __u8 ParentLeaseKey[SMB2_LEASE_KEY_SIZE];
+ __le16 Epoch;
+ __le16 Reserved;
+} __packed;
+
+struct create_lease {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct lease_context lcontext;
+} __packed;
+
+struct create_lease_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct lease_context_v2 lcontext;
+ __u8 Pad[4];
+} __packed;
+
+/* See MS-SMB2 2.2.31 and 2.2.32 */
+struct smb2_ioctl_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __le16 Reserved; /* NOTE(review): earlier "offset ... to write data" text contradicted the field name and looked copied from the write request - confirm against MS-SMB2 2.2.31 */
+ __le32 CtlCode;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __le32 InputOffset; /* NOTE(review): was marked "Reserved MBZ"; MS-SMB2 2.2.31 appears to define this as the offset of the input buffer - confirm */
+ __le32 InputCount;
+ __le32 MaxInputResponse;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 MaxOutputResponse;
+ __le32 Flags;
+ __le32 Reserved2;
+ __u8 Buffer[];
+} __packed;
+
+struct smb2_ioctl_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __le16 Reserved;
+ __le32 CtlCode;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __le32 InputOffset; /* Reserved MBZ */
+ __le32 InputCount;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 Flags;
+ __le32 Reserved2;
+ __u8 Buffer[];
+} __packed;
+
+/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */
+struct file_zero_data_information {
+ __le64 FileOffset;
+ __le64 BeyondFinalZero;
+} __packed;
+
+/* Reparse structures - see MS-FSCC 2.1.2 */
+
+/* struct fsctl_reparse_info_req is empty, only response structs (see below) */
+struct reparse_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __u8 DataBuffer[]; /* Variable Length */
+} __packed;
+
+struct reparse_guid_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __u8 ReparseGuid[16];
+ __u8 DataBuffer[]; /* Variable Length */
+} __packed;
+
+struct reparse_mount_point_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le16 SubstituteNameOffset;
+ __le16 SubstituteNameLength;
+ __le16 PrintNameOffset;
+ __le16 PrintNameLength;
+ __u8 PathBuffer[]; /* Variable Length */
+} __packed;
+
+#define SYMLINK_FLAG_RELATIVE 0x00000001
+
+struct reparse_symlink_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le16 SubstituteNameOffset;
+ __le16 SubstituteNameLength;
+ __le16 PrintNameOffset;
+ __le16 PrintNameLength;
+ __le32 Flags;
+ __u8 PathBuffer[]; /* Variable Length */
+} __packed;
+
+/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
+
+struct validate_negotiate_info_req {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 DialectCount;
+ __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
+} __packed;
+
+struct validate_negotiate_info_rsp {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 Dialect; /* Dialect in use for the connection */
+} __packed;
+
+struct duplicate_extents_to_file {
+ __u64 PersistentFileHandle; /* source file handle, opaque endianness */
+ __u64 VolatileFileHandle;
+ __le64 SourceFileOffset;
+ __le64 TargetFileOffset;
+ __le64 ByteCount; /* Bytes to be copied */
+} __packed;
+
+/* Possible InfoType values */
+#define SMB2_O_INFO_FILE 0x01
+#define SMB2_O_INFO_FILESYSTEM 0x02
+#define SMB2_O_INFO_SECURITY 0x03
+#define SMB2_O_INFO_QUOTA 0x04
+
+/* SMB2 Query Info see MS-SMB2 (2.2.37) or MS-DTYP */
+
+/* List of QUERY INFO levels (those also valid for QUERY_DIR are noted below) */
+#define FILE_DIRECTORY_INFORMATION 1 /* also for QUERY_DIR */
+#define FILE_FULL_DIRECTORY_INFORMATION 2 /* also for QUERY_DIR */
+#define FILE_BOTH_DIRECTORY_INFORMATION 3 /* also for QUERY_DIR */
+#define FILE_BASIC_INFORMATION 4
+#define FILE_STANDARD_INFORMATION 5
+#define FILE_INTERNAL_INFORMATION 6
+#define FILE_EA_INFORMATION 7
+#define FILE_ACCESS_INFORMATION 8
+#define FILE_NAME_INFORMATION 9
+#define FILE_RENAME_INFORMATION 10
+#define FILE_LINK_INFORMATION 11
+#define FILE_NAMES_INFORMATION 12 /* also for QUERY_DIR */
+#define FILE_DISPOSITION_INFORMATION 13
+#define FILE_POSITION_INFORMATION 14
+#define FILE_FULL_EA_INFORMATION 15
+#define FILE_MODE_INFORMATION 16
+#define FILE_ALIGNMENT_INFORMATION 17
+#define FILE_ALL_INFORMATION 18
+#define FILE_ALLOCATION_INFORMATION 19
+#define FILE_END_OF_FILE_INFORMATION 20
+#define FILE_ALTERNATE_NAME_INFORMATION 21
+#define FILE_STREAM_INFORMATION 22
+#define FILE_PIPE_INFORMATION 23
+#define FILE_PIPE_LOCAL_INFORMATION 24
+#define FILE_PIPE_REMOTE_INFORMATION 25
+#define FILE_MAILSLOT_QUERY_INFORMATION 26
+#define FILE_MAILSLOT_SET_INFORMATION 27
+#define FILE_COMPRESSION_INFORMATION 28
+#define FILE_OBJECT_ID_INFORMATION 29
+/* Number 30 not defined in documents */
+#define FILE_MOVE_CLUSTER_INFORMATION 31
+#define FILE_QUOTA_INFORMATION 32
+#define FILE_REPARSE_POINT_INFORMATION 33
+#define FILE_NETWORK_OPEN_INFORMATION 34
+#define FILE_ATTRIBUTE_TAG_INFORMATION 35
+#define FILE_TRACKING_INFORMATION 36
+#define FILEID_BOTH_DIRECTORY_INFORMATION 37 /* also for QUERY_DIR */
+#define FILEID_FULL_DIRECTORY_INFORMATION 38 /* also for QUERY_DIR */
+#define FILE_VALID_DATA_LENGTH_INFORMATION 39
+#define FILE_SHORT_NAME_INFORMATION 40
+#define FILE_SFIO_RESERVE_INFORMATION 44
+#define FILE_SFIO_VOLUME_INFORMATION 45
+#define FILE_HARD_LINK_INFORMATION 46
+#define FILE_NORMALIZED_NAME_INFORMATION 48
+#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
+#define FILE_STANDARD_LINK_INFORMATION 54
+#define FILE_ID_INFORMATION 59
+#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 /* also for QUERY_DIR */
+/* Used for Query Info and Find File POSIX Info for SMB3.1.1 and SMB1 */
+#define SMB_FIND_FILE_POSIX_INFO 0x064
+
+/* Security info type additionalinfo flags. */
+#define OWNER_SECINFO 0x00000001
+#define GROUP_SECINFO 0x00000002
+#define DACL_SECINFO 0x00000004
+#define SACL_SECINFO 0x00000008
+#define LABEL_SECINFO 0x00000010
+#define ATTRIBUTE_SECINFO 0x00000020
+#define SCOPE_SECINFO 0x00000040
+#define BACKUP_SECINFO 0x00010000
+#define UNPROTECTED_SACL_SECINFO 0x10000000
+#define UNPROTECTED_DACL_SECINFO 0x20000000
+#define PROTECTED_SACL_SECINFO 0x40000000
+#define PROTECTED_DACL_SECINFO 0x80000000
+
+/* Flags used for FileFullEAinfo */
+#define SL_RESTART_SCAN 0x00000001
+#define SL_RETURN_SINGLE_ENTRY 0x00000002
+#define SL_INDEX_SPECIFIED 0x00000004
+
+struct smb2_query_info_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 41 */
+ __u8 InfoType;
+ __u8 FileInfoClass;
+ __le32 OutputBufferLength;
+ __le16 InputBufferOffset;
+ __u16 Reserved;
+ __le32 InputBufferLength;
+ __le32 AdditionalInformation;
+ __le32 Flags;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_query_info_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+/*
+ * PDU query infolevel structure definitions
+ */
+
+struct file_allocated_range_buffer {
+ __le64 file_offset;
+ __le64 length;
+} __packed;
+
+struct smb2_file_internal_info {
+ __le64 IndexNumber;
+} __packed; /* level 6 Query */
+
+struct smb2_file_rename_info { /* encoding of request for level 10 */
+ __u8 ReplaceIfExists; /* 1 = replace existing target with new */
+ /* 0 = fail if target already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
+ __le32 FileNameLength;
+ char FileName[]; /* New name to be assigned */
+ /* padding - overall struct size must be >= 24 so filename + pad >= 6 */
+} __packed; /* level 10 Set */
+
+struct smb2_file_link_info { /* encoding of request for level 11 */
+ __u8 ReplaceIfExists; /* 1 = replace existing link with new */
+ /* 0 = fail if link already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
+ __le32 FileNameLength;
+ char FileName[]; /* Name to be assigned to new link */
+} __packed; /* level 11 Set */
+
+/*
+ * Level 18 differs from cifs level 0x107 even though the struct name is the
+ * same: level 0x107 has an extra u64 between AccessFlags and
+ * CurrentByteOffset.
+ */
+struct smb2_file_all_info { /* data block encoding of response to level 18 */
+ __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le32 Attributes;
+ __u32 Pad1; /* End of FILE_BASIC_INFO equivalent */
+ __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 EndOfFile; /* size ie offset to first free byte in file */
+ __le32 NumberOfLinks; /* hard links */
+ __u8 DeletePending;
+ __u8 Directory;
+ __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */
+ __le64 IndexNumber;
+ __le32 EASize;
+ __le32 AccessFlags;
+ __le64 CurrentByteOffset;
+ __le32 Mode;
+ __le32 AlignmentRequirement;
+ __le32 FileNameLength;
+ char FileName[1];
+} __packed; /* level 18 Query */
+
+struct smb2_file_eof_info { /* encoding of request for level 20 */
+ __le64 EndOfFile; /* new end of file value */
+} __packed; /* level 20 Set */
+
+/* Level 100 query info */
+struct smb311_posix_qinfo {
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 DosAttributes;
+ __le64 Inode;
+ __le32 DeviceId;
+ __le32 Zero;
+ /* beginning of POSIX Create Context Response */
+ __le32 HardLinks;
+ __le32 ReparseTag;
+ __le32 Mode;
+ u8 Sids[];
+ /*
+ * var sized owner SID
+ * var sized group SID
+ * le32 filenamelength
+ * u8 filename[]
+ */
+} __packed;
+
+/* File System Information Classes */
+#define FS_VOLUME_INFORMATION 1 /* Query */
+#define FS_LABEL_INFORMATION 2 /* Set */
+#define FS_SIZE_INFORMATION 3 /* Query */
+#define FS_DEVICE_INFORMATION 4 /* Query */
+#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
+#define FS_CONTROL_INFORMATION 6 /* Query, Set */
+#define FS_FULL_SIZE_INFORMATION 7 /* Query */
+#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
+#define FS_DRIVER_PATH_INFORMATION 9 /* Query */
+#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */
+#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */
+
+struct smb2_fs_full_size_info {
+ __le64 TotalAllocationUnits;
+ __le64 CallerAvailableAllocationUnits;
+ __le64 ActualAvailableAllocationUnits;
+ __le32 SectorsPerAllocationUnit;
+ __le32 BytesPerSector;
+} __packed;
+
+#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001
+#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
+#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004
+#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008
+
+/* sector size info struct */
+struct smb3_fs_ss_info {
+ __le32 LogicalBytesPerSector;
+ __le32 PhysicalBytesPerSectorForAtomicity;
+ __le32 PhysicalBytesPerSectorForPerf;
+ __le32 FSEffPhysicalBytesPerSectorForAtomicity;
+ __le32 Flags;
+ __le32 ByteOffsetForSectorAlignment;
+ __le32 ByteOffsetForPartitionAlignment;
+} __packed;
+
+/* File System Control Information */
+struct smb2_fs_control_info {
+ __le64 FreeSpaceStartFiltering;
+ __le64 FreeSpaceThreshold;
+ __le64 FreeSpaceStopFiltering;
+ __le64 DefaultQuotaThreshold;
+ __le64 DefaultQuotaLimit;
+ __le32 FileSystemControlFlags;
+ __le32 Padding;
+} __packed;
+
+/* volume info struct - see MS-FSCC 2.5.9 */
+#define MAX_VOL_LABEL_LEN 32
+struct smb3_fs_vol_info {
+ __le64 VolumeCreationTime;
+ __u32 VolumeSerialNumber;
+ __le32 VolumeLabelLength; /* includes trailing null */
+ __u8 SupportsObjects; /* True if eg like NTFS, supports objects */
+ __u8 Reserved;
+ __u8 VolumeLabel[]; /* variable len */
+} __packed;
+
+/* See MS-SMB2 2.2.23 through 2.2.25 */
+struct smb2_oplock_break {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __u8 OplockLevel;
+ __u8 Reserved;
+ __le32 Reserved2;
+ __u64 PersistentFid;
+ __u64 VolatileFid;
+} __packed;
+
+#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
+
+struct smb2_lease_break {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 44 */
+ __le16 Epoch;
+ __le32 Flags;
+ __u8 LeaseKey[16];
+ __le32 CurrentLeaseState;
+ __le32 NewLeaseState;
+ __le32 BreakReason;
+ __le32 AccessMaskHint;
+ __le32 ShareMaskHint;
+} __packed;
+
+struct smb2_lease_ack {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 36 */
+ __le16 Reserved;
+ __le32 Flags;
+ __u8 LeaseKey[16];
+ __le32 LeaseState;
+ __le64 LeaseDuration;
+} __packed;
+
+#define OP_BREAK_STRUCT_SIZE_20 24
+#define OP_BREAK_STRUCT_SIZE_21 36
+#endif /* _COMMON_SMB2PDU_H */
diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h
index 926f87cd6af0..d51939c43ad7 100644
--- a/fs/smbfs_common/smbfsctl.h
+++ b/fs/smbfs_common/smbfsctl.h
@@ -95,8 +95,10 @@
#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
#define FSCTL_GET_REFS_VOLUME_DATA 0x000902D8 /* See MS-FSCC 2.3.24 */
+#define FSCTL_SET_INTEGRITY_INFORMATION_EXT 0x00090380
#define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3
#define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b
+#define FSCTL_REFS_STREAM_SNAPSHOT_MANAGEMENT 0x00090440
#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF
#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
diff --git a/fs/splice.c b/fs/splice.c
index 5dbce4dcc1a7..047b79db8eb5 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -46,45 +46,45 @@
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct page *page = buf->page;
+ struct folio *folio = page_folio(buf->page);
struct address_space *mapping;
- lock_page(page);
+ folio_lock(folio);
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
if (mapping) {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!folio_test_uptodate(folio));
/*
* At least for ext2 with nobh option, we need to wait on
- * writeback completing on this page, since we'll remove it
+ * writeback completing on this folio, since we'll remove it
* from the pagecache. Otherwise truncate wont wait on the
- * page, allowing the disk blocks to be reused by someone else
+ * folio, allowing the disk blocks to be reused by someone else
* before we actually wrote our data to them. fs corruption
* ensues.
*/
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL))
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL))
goto out_unlock;
/*
* If we succeeded in removing the mapping, set LRU flag
* and return good.
*/
- if (remove_mapping(mapping, page)) {
+ if (remove_mapping(mapping, folio)) {
buf->flags |= PIPE_BUF_FLAG_LRU;
return true;
}
}
/*
- * Raced with truncate or failed to remove page from current
+ * Raced with truncate or failed to remove folio from current
* address space, unlock and return failure.
*/
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
return false;
}
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2db8bcf7ff85..622c844f6d11 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -86,16 +86,19 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
int error, i;
struct bio *bio;
if (page_count <= BIO_MAX_VECS)
- bio = bio_alloc(GFP_NOIO, page_count);
+ bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO);
else
bio = bio_kmalloc(GFP_NOIO, page_count);
if (!bio)
return -ENOMEM;
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_opf = READ;
+ /* bio_kmalloc() can return NULL: set device and op only after the check above */
+ if (page_count > BIO_MAX_VECS) {
+ bio_set_dev(bio, sb->s_bdev);
+ bio->bi_opf = REQ_OP_READ;
+ }
bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
for (i = 0; i < page_count; ++i) {
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 60d6951915f4..4f74abbc1a54 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -16,6 +16,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
@@ -28,6 +29,7 @@
#include <linux/module.h>
#include <linux/magic.h>
#include <linux/xattr.h>
+#include <linux/backing-dev.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -111,6 +113,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem(
return decompressor;
}
+static int squashfs_bdi_init(struct super_block *sb) /* set up sb->s_bdi for squashfs with ra_pages and io_pages zeroed; returns 0 or the super_setup_bdi_name() error */
+{
+ int err;
+ unsigned int major = MAJOR(sb->s_dev);
+ unsigned int minor = MINOR(sb->s_dev);
+
+ bdi_put(sb->s_bdi); /* put the bdi that sb currently points at before replacing it */
+ sb->s_bdi = &noop_backing_dev_info; /* NOTE(review): presumably super_setup_bdi_name() expects the noop bdi here - confirm */
+
+ err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); /* bdi name is built from the major/minor of sb->s_dev */
+ if (err)
+ return err;
+
+ sb->s_bdi->ra_pages = 0; /* rationale: see the read-ahead comment in squashfs_fill_super() */
+ sb->s_bdi->io_pages = 0;
+
+ return 0;
+}
static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
@@ -126,6 +146,20 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
TRACE("Entered squashfs_fill_superblock\n");
+ /*
+ * squashfs provides its own 'backing_dev_info' in order to disable
+ * read-ahead. For squashfs, I/O is not deferred; it is done immediately in
+ * readpage, which means the user always has to wait for their own I/O, so
+ * readahead gives very little benefit. squashfs_bdi_init() sets
+ * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0, which turns readahead
+ * off for squashfs.
+ */
+ err = squashfs_bdi_init(sb);
+ if (err) {
+ errorf(fc, "squashfs init bdi failed");
+ return err;
+ }
+
sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
if (sb->s_fs_info == NULL) {
ERROR("Failed to allocate squashfs_sb_info\n");
@@ -179,8 +213,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Check the filesystem does not extend beyond the end of the
block device */
msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
- if (msblk->bytes_used < 0 || msblk->bytes_used >
- i_size_read(sb->s_bdev->bd_inode))
+ if (msblk->bytes_used < 0 ||
+ msblk->bytes_used > bdev_nr_bytes(sb->s_bdev))
goto failed_mount;
/* Check block size for sanity */
@@ -550,7 +584,7 @@ static void __exit exit_squashfs_fs(void)
static struct inode *squashfs_alloc_inode(struct super_block *sb)
{
struct squashfs_inode_info *ei =
- kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
+ alloc_inode_sb(sb, squashfs_inode_cachep, GFP_KERNEL);
return ei ? &ei->vfs_inode : NULL;
}
diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c
index 0015cf8b5582..c40445dbf38c 100644
--- a/fs/squashfs/zstd_wrapper.c
+++ b/fs/squashfs/zstd_wrapper.c
@@ -34,7 +34,7 @@ static void *zstd_init(struct squashfs_sb_info *msblk, void *buff)
goto failed;
wksp->window_size = max_t(size_t,
msblk->block_size, SQUASHFS_METADATA_SIZE);
- wksp->mem_size = ZSTD_DStreamWorkspaceBound(wksp->window_size);
+ wksp->mem_size = zstd_dstream_workspace_bound(wksp->window_size);
wksp->mem = vmalloc(wksp->mem_size);
if (wksp->mem == NULL)
goto failed;
@@ -63,15 +63,15 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
struct squashfs_page_actor *output)
{
struct workspace *wksp = strm;
- ZSTD_DStream *stream;
+ zstd_dstream *stream;
size_t total_out = 0;
int error = 0;
- ZSTD_inBuffer in_buf = { NULL, 0, 0 };
- ZSTD_outBuffer out_buf = { NULL, 0, 0 };
+ zstd_in_buffer in_buf = { NULL, 0, 0 };
+ zstd_out_buffer out_buf = { NULL, 0, 0 };
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
- stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size);
+ stream = zstd_init_dstream(wksp->window_size, wksp->mem, wksp->mem_size);
if (!stream) {
ERROR("Failed to initialize zstd decompressor\n");
@@ -116,14 +116,14 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
total_out -= out_buf.pos;
- zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+ zstd_err = zstd_decompress_stream(stream, &out_buf, &in_buf);
total_out += out_buf.pos; /* add the additional data produced */
if (zstd_err == 0)
break;
- if (ZSTD_isError(zstd_err)) {
+ if (zstd_is_error(zstd_err)) {
ERROR("zstd decompression error: %d\n",
- (int)ZSTD_getErrorCode(zstd_err));
+ (int)zstd_get_error_code(zstd_err));
error = -EIO;
break;
}
diff --git a/fs/stat.c b/fs/stat.c
index 28d2020ba1f4..5c2c94464e8b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -184,6 +184,20 @@ int vfs_fstat(int fd, struct kstat *stat)
return error;
}
+int getname_statx_lookup_flags(int flags) /* translate AT_* flags into the matching LOOKUP_* flags; other bits in flags are ignored */
+{
+ int lookup_flags = 0;
+
+ if (!(flags & AT_SYMLINK_NOFOLLOW)) /* following is the default; the AT_ flag switches it off */
+ lookup_flags |= LOOKUP_FOLLOW;
+ if (!(flags & AT_NO_AUTOMOUNT)) /* likewise LOOKUP_AUTOMOUNT is set unless the caller opts out */
+ lookup_flags |= LOOKUP_AUTOMOUNT;
+ if (flags & AT_EMPTY_PATH) /* opt-in: only set when requested */
+ lookup_flags |= LOOKUP_EMPTY;
+
+ return lookup_flags;
+}
+
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
@@ -199,26 +213,19 @@ int vfs_fstat(int fd, struct kstat *stat)
*
* 0 will be returned on success, and a -ve error code if unsuccessful.
*/
-static int vfs_statx(int dfd, const char __user *filename, int flags,
+static int vfs_statx(int dfd, struct filename *filename, int flags,
struct kstat *stat, u32 request_mask)
{
struct path path;
- unsigned lookup_flags = 0;
+ unsigned int lookup_flags = getname_statx_lookup_flags(flags);
int error;
if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
AT_STATX_SYNC_TYPE))
return -EINVAL;
- if (!(flags & AT_SYMLINK_NOFOLLOW))
- lookup_flags |= LOOKUP_FOLLOW;
- if (!(flags & AT_NO_AUTOMOUNT))
- lookup_flags |= LOOKUP_AUTOMOUNT;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
retry:
- error = user_path_at(dfd, filename, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
goto out;
@@ -240,8 +247,15 @@ out:
int vfs_fstatat(int dfd, const char __user *filename,
struct kstat *stat, int flags)
{
- return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
- stat, STATX_BASIC_STATS);
+ int ret;
+ int statx_flags = flags | AT_NO_AUTOMOUNT;
+ struct filename *name;
+
+ name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL);
+ ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); /* NOTE(review): name is not error-checked in this function; presumably vfs_statx() and putname() cope with a failed getname_flags() result - confirm */
+ putname(name);
+
+ return ret;
}
#ifdef __ARCH_WANT_OLD_STAT
@@ -334,9 +348,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
-#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
-
#ifndef INIT_STRUCT_STAT_PADDING
# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif
@@ -345,7 +356,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
struct stat tmp;
- if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
#if BITS_PER_LONG == 32
if (stat->size > MAX_NON_LFS)
@@ -353,7 +366,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
#endif
INIT_STRUCT_STAT_PADDING(tmp);
- tmp.st_dev = encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -363,7 +376,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
tmp.st_size = stat->size;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_mtime = stat->mtime.tv_sec;
@@ -602,7 +615,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
-int do_statx(int dfd, const char __user *filename, unsigned flags,
+int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer)
{
struct kstat stat;
@@ -636,7 +649,14 @@ SYSCALL_DEFINE5(statx,
unsigned int, mask,
struct statx __user *, buffer)
{
- return do_statx(dfd, filename, flags, mask, buffer);
+ int ret;
+ struct filename *name;
+
+ name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
+ ret = do_statx(dfd, name, flags, mask, buffer);
+ putname(name);
+
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -644,11 +664,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
struct compat_stat tmp;
- if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
memset(&tmp, 0, sizeof(tmp));
- tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -658,7 +680,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = old_encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
if ((u64) stat->size > MAX_NON_LFS)
return -EOVERFLOW;
tmp.st_size = stat->size;
diff --git a/fs/super.c b/fs/super.c
index bcef3a6f4c4b..f1d4a193602d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -31,7 +31,6 @@
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
-#include <linux/cleancache.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
@@ -260,7 +259,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_time_gran = 1000000000;
s->s_time_min = TIME64_MIN;
s->s_time_max = TIME64_MAX;
- s->cleancache_poolid = CLEANCACHE_NO_POOL;
s->s_shrink.seeks = DEFAULT_SEEKS;
s->s_shrink.scan_objects = super_cache_scan;
@@ -330,7 +328,6 @@ void deactivate_locked_super(struct super_block *s)
{
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
- cleancache_invalidate_fs(s);
unregister_shrinker(&s->s_shrink);
fs->kill_sb(s);
@@ -476,6 +473,8 @@ void generic_shutdown_super(struct super_block *sb)
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
if (sb->s_bdi != &noop_backing_dev_info) {
+ if (sb->s_iflags & SB_I_PERSB_BDI)
+ bdi_unregister(sb->s_bdi);
bdi_put(sb->s_bdi);
sb->s_bdi = &noop_backing_dev_info;
}
@@ -1421,8 +1420,8 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_nodev);
-static int reconfigure_single(struct super_block *s,
- int flags, void *data)
+int reconfigure_single(struct super_block *s,
+ int flags, void *data)
{
struct fs_context *fc;
int ret;
@@ -1562,6 +1561,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
}
WARN_ON(sb->s_bdi != &noop_backing_dev_info);
sb->s_bdi = bdi;
+ sb->s_iflags |= SB_I_PERSB_BDI;
return 0;
}
@@ -1616,11 +1616,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb)
percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}
-static void sb_freeze_unlock(struct super_block *sb)
+static void sb_freeze_unlock(struct super_block *sb, int level)
{
- int level;
-
- for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+ for (level--; level >= 0; level--)
percpu_up_write(sb->s_writers.rw_sem + level);
}
@@ -1691,7 +1689,14 @@ int freeze_super(struct super_block *sb)
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
/* All writers are done so after syncing there won't be dirty data */
- sync_filesystem(sb);
+ ret = sync_filesystem(sb);
+ if (ret) {
+ sb->s_writers.frozen = SB_UNFROZEN;
+ sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
+ wake_up(&sb->s_writers.wait_unfrozen);
+ deactivate_locked_super(sb);
+ return ret;
+ }
/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
@@ -1703,7 +1708,7 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
- sb_freeze_unlock(sb);
+ sb_freeze_unlock(sb, SB_FREEZE_FS);
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return ret;
@@ -1748,7 +1753,7 @@ static int thaw_super_locked(struct super_block *sb)
}
sb->s_writers.frozen = SB_UNFROZEN;
- sb_freeze_unlock(sb);
+ sb_freeze_unlock(sb, SB_FREEZE_FS);
out:
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 1373a610dc78..c7690016453e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -3,6 +3,7 @@
* High-level sync()-related operations
*/
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
@@ -22,32 +23,13 @@
SYNC_FILE_RANGE_WAIT_AFTER)
/*
- * Do the filesystem syncing work. For simple filesystems
- * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
- * submit IO for these buffers via __sync_blockdev(). This also speeds up the
- * wait == 1 case since in that case write_inode() functions do
- * sync_dirty_buffer() and thus effectively write one block at a time.
- */
-static int __sync_filesystem(struct super_block *sb, int wait)
-{
- if (wait)
- sync_inodes_sb(sb);
- else
- writeback_inodes_sb(sb, WB_REASON_SYNC);
-
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, wait);
- return __sync_blockdev(sb->s_bdev, wait);
-}
-
-/*
* Write out and wait upon all dirty data associated with this
* superblock. Filesystem data as well as the underlying block
* device. Takes the superblock lock.
*/
int sync_filesystem(struct super_block *sb)
{
- int ret;
+ int ret = 0;
/*
* We need to be protected against the filesystem going from
@@ -61,10 +43,31 @@ int sync_filesystem(struct super_block *sb)
if (sb_rdonly(sb))
return 0;
- ret = __sync_filesystem(sb, 0);
- if (ret < 0)
+ /*
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
+ * to submit I/O for these buffers via sync_blockdev(). This also
+ * speeds up the wait == 1 case since in that case write_inode()
+ * methods call sync_dirty_buffer() and thus effectively write one block
+ * at a time.
+ */
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
+ if (sb->s_op->sync_fs) {
+ ret = sb->s_op->sync_fs(sb, 0);
+ if (ret)
+ return ret;
+ }
+ ret = sync_blockdev_nowait(sb->s_bdev);
+ if (ret)
return ret;
- return __sync_filesystem(sb, 1);
+
+ sync_inodes_sb(sb);
+ if (sb->s_op->sync_fs) {
+ ret = sb->s_op->sync_fs(sb, 1);
+ if (ret)
+ return ret;
+ }
+ return sync_blockdev(sb->s_bdev);
}
EXPORT_SYMBOL(sync_filesystem);
@@ -81,21 +84,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
sb->s_op->sync_fs(sb, *(int *)arg);
}
-static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
-{
- filemap_fdatawrite(bdev->bd_inode->i_mapping);
-}
-
-static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
-{
- /*
- * We keep the error status of individual mapping so that
- * applications can catch the writeback error using fsync(2).
- * See filemap_fdatawait_keep_errors() for details.
- */
- filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
-}
-
/*
* Sync everything. We start by waking flusher threads so that most of
* writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -114,8 +102,8 @@ void ksys_sync(void)
iterate_supers(sync_inodes_one_sb, NULL);
iterate_supers(sync_fs_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &wait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
- iterate_bdevs(fdatawait_one_bdev, NULL);
+ sync_bdevs(false);
+ sync_bdevs(true);
if (unlikely(laptop_mode))
laptop_sync_completion();
}
@@ -136,10 +124,10 @@ static void do_sync_work(struct work_struct *work)
*/
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
+ sync_bdevs(false);
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
+ sync_bdevs(false);
printk("Emergency Sync complete\n");
kfree(work);
}
diff --git a/fs/sysctls.c b/fs/sysctls.c
new file mode 100644
index 000000000000..c701273c9432
--- /dev/null
+++ b/fs/sysctls.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /proc/sys/fs shared sysctls
+ *
+ * These sysctls are shared between different filesystems.
+ */
+#include <linux/init.h>
+#include <linux/sysctl.h>
+
+static struct ctl_table fs_shared_sysctls[] = {
+ {
+ .procname = "overflowuid",
+ .data = &fs_overflowuid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
+ },
+ {
+ .procname = "overflowgid",
+ .data = &fs_overflowgid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
+ },
+ { }
+};
+
+DECLARE_SYSCTL_BASE(fs, fs_shared_sysctls);
+
+static int __init init_fs_sysctls(void)
+{
+ return register_sysctl_base(fs);
+}
+
+early_initcall(init_fs_sysctls);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 59dffd5ca517..b6b6796e1616 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -56,8 +56,7 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
kobject_get_ownership(kobj, &uid, &gid);
- kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
- S_IRWXU | S_IRUGO | S_IXUGO, uid, gid,
+ kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid,
kobj, ns);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d019d6ac6ad0..a12ac0356c69 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -45,6 +45,9 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
ssize_t count;
char *buf;
+ if (WARN_ON_ONCE(!ops->show))
+ return -EINVAL;
+
/* acquire buffer and ensure that it's >= PAGE_SIZE and clear */
count = seq_get_buf(sf, &buf);
if (count < PAGE_SIZE) {
@@ -53,15 +56,9 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
}
memset(buf, 0, PAGE_SIZE);
- /*
- * Invoke show(). Control may reach here via seq file lseek even
- * if @ops->show() isn't implemented.
- */
- if (ops->show) {
- count = ops->show(kobj, of->kn->priv, buf);
- if (count < 0)
- return count;
- }
+ count = ops->show(kobj, of->kn->priv, buf);
+ if (count < 0)
+ return count;
/*
* The code works fine with PAGE_SIZE return but it's likely to
@@ -255,67 +252,82 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
};
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
- const struct attribute *attr, bool is_bin,
- umode_t mode, kuid_t uid, kgid_t gid, const void *ns)
+ const struct attribute *attr, umode_t mode, kuid_t uid,
+ kgid_t gid, const void *ns)
{
+ struct kobject *kobj = parent->priv;
+ const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
struct lock_class_key *key = NULL;
- const struct kernfs_ops *ops;
+ const struct kernfs_ops *ops = NULL;
struct kernfs_node *kn;
- loff_t size;
-
- if (!is_bin) {
- struct kobject *kobj = parent->priv;
- const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
-
- /* every kobject with an attribute needs a ktype assigned */
- if (WARN(!sysfs_ops, KERN_ERR
- "missing sysfs attribute operations for kobject: %s\n",
- kobject_name(kobj)))
- return -EINVAL;
-
- if (sysfs_ops->show && sysfs_ops->store) {
- if (mode & SYSFS_PREALLOC)
- ops = &sysfs_prealloc_kfops_rw;
- else
- ops = &sysfs_file_kfops_rw;
- } else if (sysfs_ops->show) {
- if (mode & SYSFS_PREALLOC)
- ops = &sysfs_prealloc_kfops_ro;
- else
- ops = &sysfs_file_kfops_ro;
- } else if (sysfs_ops->store) {
- if (mode & SYSFS_PREALLOC)
- ops = &sysfs_prealloc_kfops_wo;
- else
- ops = &sysfs_file_kfops_wo;
- } else
- ops = &sysfs_file_kfops_empty;
-
- size = PAGE_SIZE;
+
+ /* every kobject with an attribute needs a ktype assigned */
+ if (WARN(!sysfs_ops, KERN_ERR
+ "missing sysfs attribute operations for kobject: %s\n",
+ kobject_name(kobj)))
+ return -EINVAL;
+
+ if (mode & SYSFS_PREALLOC) {
+ if (sysfs_ops->show && sysfs_ops->store)
+ ops = &sysfs_prealloc_kfops_rw;
+ else if (sysfs_ops->show)
+ ops = &sysfs_prealloc_kfops_ro;
+ else if (sysfs_ops->store)
+ ops = &sysfs_prealloc_kfops_wo;
} else {
- struct bin_attribute *battr = (void *)attr;
-
- if (battr->mmap)
- ops = &sysfs_bin_kfops_mmap;
- else if (battr->read && battr->write)
- ops = &sysfs_bin_kfops_rw;
- else if (battr->read)
- ops = &sysfs_bin_kfops_ro;
- else if (battr->write)
- ops = &sysfs_bin_kfops_wo;
- else
- ops = &sysfs_file_kfops_empty;
-
- size = battr->size;
+ if (sysfs_ops->show && sysfs_ops->store)
+ ops = &sysfs_file_kfops_rw;
+ else if (sysfs_ops->show)
+ ops = &sysfs_file_kfops_ro;
+ else if (sysfs_ops->store)
+ ops = &sysfs_file_kfops_wo;
}
+ if (!ops)
+ ops = &sysfs_file_kfops_empty;
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (!attr->ignore_lockdep)
key = attr->key ?: (struct lock_class_key *)&attr->skey;
#endif
kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid,
- size, ops, (void *)attr, ns, key);
+ PAGE_SIZE, ops, (void *)attr, ns, key);
+ if (IS_ERR(kn)) {
+ if (PTR_ERR(kn) == -EEXIST)
+ sysfs_warn_dup(parent, attr->name);
+ return PTR_ERR(kn);
+ }
+ return 0;
+}
+
+int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
+ const struct bin_attribute *battr, umode_t mode,
+ kuid_t uid, kgid_t gid, const void *ns)
+{
+ const struct attribute *attr = &battr->attr;
+ struct lock_class_key *key = NULL;
+ const struct kernfs_ops *ops;
+ struct kernfs_node *kn;
+
+ if (battr->mmap)
+ ops = &sysfs_bin_kfops_mmap;
+ else if (battr->read && battr->write)
+ ops = &sysfs_bin_kfops_rw;
+ else if (battr->read)
+ ops = &sysfs_bin_kfops_ro;
+ else if (battr->write)
+ ops = &sysfs_bin_kfops_wo;
+ else
+ ops = &sysfs_file_kfops_empty;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (!attr->ignore_lockdep)
+ key = attr->key ?: (struct lock_class_key *)&attr->skey;
+#endif
+
+ kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid,
+ battr->size, ops, (void *)attr, ns, key);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
sysfs_warn_dup(parent, attr->name);
@@ -340,9 +352,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
- return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode,
- uid, gid, ns);
-
+ return sysfs_add_file_mode_ns(kobj->sd, attr, attr->mode, uid, gid, ns);
}
EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
@@ -385,8 +395,8 @@ int sysfs_add_file_to_group(struct kobject *kobj,
return -ENOENT;
kobject_get_ownership(kobj, &uid, &gid);
- error = sysfs_add_file_mode_ns(parent, attr, false,
- attr->mode, uid, gid, NULL);
+ error = sysfs_add_file_mode_ns(parent, attr, attr->mode, uid, gid,
+ NULL);
kernfs_put(parent);
return error;
@@ -555,8 +565,8 @@ int sysfs_create_bin_file(struct kobject *kobj,
return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
- return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true,
- attr->attr.mode, uid, gid, NULL);
+ return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, uid,
+ gid, NULL);
}
EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
@@ -693,19 +703,6 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
ktype = get_ktype(kobj);
if (ktype) {
- struct attribute **kattr;
-
- /*
- * Change owner of the default attributes associated with the
- * ktype of @kobj.
- */
- for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) {
- error = sysfs_file_change_owner(kobj, (*kattr)->name,
- kuid, kgid);
- if (error)
- return error;
- }
-
/*
* Change owner of the default groups associated with the
* ktype of @kobj.
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index f29d62004527..eeb0e3099421 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -61,8 +61,8 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
(*attr)->name, mode);
mode &= SYSFS_PREALLOC | 0664;
- error = sysfs_add_file_mode_ns(parent, *attr, false,
- mode, uid, gid, NULL);
+ error = sysfs_add_file_mode_ns(parent, *attr, mode, uid,
+ gid, NULL);
if (unlikely(error))
break;
}
@@ -90,10 +90,9 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
(*bin_attr)->attr.name, mode);
mode &= SYSFS_PREALLOC | 0664;
- error = sysfs_add_file_mode_ns(parent,
- &(*bin_attr)->attr, true,
- mode,
- uid, gid, NULL);
+ error = sysfs_add_bin_file_mode_ns(parent, *bin_attr,
+ mode, uid, gid,
+ NULL);
if (error)
break;
}
@@ -340,8 +339,8 @@ int sysfs_merge_group(struct kobject *kobj,
kobject_get_ownership(kobj, &uid, &gid);
for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
- error = sysfs_add_file_mode_ns(parent, *attr, false,
- (*attr)->mode, uid, gid, NULL);
+ error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode,
+ uid, gid, NULL);
if (error) {
while (--i >= 0)
kernfs_remove_by_name(parent, (*--attr)->name);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index e747c135c1d1..98467bb76737 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -103,7 +103,7 @@ int __init sysfs_init(void)
if (IS_ERR(sysfs_root))
return PTR_ERR(sysfs_root);
- sysfs_root_kn = sysfs_root->kn;
+ sysfs_root_kn = kernfs_root_to_node(sysfs_root);
err = register_filesystem(&sysfs_fs_type);
if (err) {
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 0050cc0c0236..3f28c9af5756 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -28,9 +28,11 @@ void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
* file.c
*/
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
- const struct attribute *attr, bool is_bin,
- umode_t amode, kuid_t uid, kgid_t gid,
- const void *ns);
+ const struct attribute *attr, umode_t amode, kuid_t uid,
+ kgid_t gid, const void *ns);
+int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
+ const struct bin_attribute *battr, umode_t mode,
+ kuid_t uid, kgid_t gid, const void *ns);
/*
* symlink.c
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index be47263b8605..9e8d4a6fb2f3 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -306,7 +306,7 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
{
struct sysv_inode_info *si;
- si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL);
+ si = alloc_inode_sb(sb, sysv_inode_cachep, GFP_KERNEL);
if (!si)
return NULL;
return &si->vfs_inode;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 749385015a8d..409ab5e17803 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -495,7 +495,8 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations sysv_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = sysv_readpage,
.writepage = sysv_writepage,
.write_begin = sysv_write_begin,
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index cc8e2ed155c8..d1def0771a40 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -474,10 +474,8 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
struct sysv_sb_info *sbi;
struct buffer_head *bh;
- if (440 != sizeof (struct v7_super_block))
- panic("V7 FS: bad super-block size");
- if (64 != sizeof (struct sysv_inode))
- panic("sysv fs: bad i-node size");
+ BUILD_BUG_ON(sizeof(struct v7_super_block) != 440);
+ BUILD_BUG_ON(sizeof(struct sysv_inode) != 64);
sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
if (!sbi)
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 1261e8b41edb..de7252715b12 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -109,12 +109,12 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
* also the directory that is being deleted.
*/
inode_unlock(inode);
- inode_unlock(dentry->d_inode);
+ inode_unlock(d_inode(dentry));
ret = tracefs_ops.rmdir(name);
inode_lock_nested(inode, I_MUTEX_PARENT);
- inode_lock(dentry->d_inode);
+ inode_lock(d_inode(dentry));
kfree(name);
@@ -161,6 +161,77 @@ struct tracefs_fs_info {
struct tracefs_mount_opts mount_opts;
};
+static void change_gid(struct dentry *dentry, kgid_t gid)
+{
+ if (!dentry->d_inode)
+ return;
+ dentry->d_inode->i_gid = gid;
+}
+
+/*
+ * Taken from d_walk, but without the need for handling renames.
+ * Nothing can be renamed while walking the list, as tracefs
+ * does not support renames. This is only called when mounting
+ * or remounting the file system, to set all the files to
+ * the given gid.
+ */
+static void set_gid(struct dentry *parent, kgid_t gid)
+{
+ struct dentry *this_parent;
+ struct list_head *next;
+
+ this_parent = parent;
+ spin_lock(&this_parent->d_lock);
+
+ change_gid(this_parent, gid);
+repeat:
+ next = this_parent->d_subdirs.next;
+resume:
+ while (next != &this_parent->d_subdirs) {
+ struct list_head *tmp = next;
+ struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+ next = tmp->next;
+
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+
+ change_gid(dentry, gid);
+
+ if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&this_parent->d_lock);
+ spin_release(&dentry->d_lock.dep_map, _RET_IP_);
+ this_parent = dentry;
+ spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
+ goto repeat;
+ }
+ spin_unlock(&dentry->d_lock);
+ }
+ /*
+ * All done at this level ... ascend and resume the search.
+ */
+ rcu_read_lock();
+ascend:
+ if (this_parent != parent) {
+ struct dentry *child = this_parent;
+ this_parent = child->d_parent;
+
+ spin_unlock(&child->d_lock);
+ spin_lock(&this_parent->d_lock);
+
+ /* go into the first sibling still alive */
+ do {
+ next = child->d_child.next;
+ if (next == &this_parent->d_subdirs)
+ goto ascend;
+ child = list_entry(next, struct dentry, d_child);
+ } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
+ rcu_read_unlock();
+ goto resume;
+ }
+ rcu_read_unlock();
+ spin_unlock(&this_parent->d_lock);
+ return;
+}
+
static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
{
substring_t args[MAX_OPT_ARGS];
@@ -212,14 +283,16 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
static int tracefs_apply_options(struct super_block *sb)
{
struct tracefs_fs_info *fsi = sb->s_fs_info;
- struct inode *inode = sb->s_root->d_inode;
+ struct inode *inode = d_inode(sb->s_root);
struct tracefs_mount_opts *opts = &fsi->mount_opts;
inode->i_mode &= ~S_IALLUGO;
inode->i_mode |= opts->mode;
inode->i_uid = opts->uid;
- inode->i_gid = opts->gid;
+
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
return 0;
}
@@ -331,18 +404,18 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = tracefs_mount->mnt_root;
- inode_lock(parent->d_inode);
- if (unlikely(IS_DEADDIR(parent->d_inode)))
+ inode_lock(d_inode(parent));
+ if (unlikely(IS_DEADDIR(d_inode(parent))))
dentry = ERR_PTR(-ENOENT);
else
dentry = lookup_one_len(name, parent, strlen(name));
- if (!IS_ERR(dentry) && dentry->d_inode) {
+ if (!IS_ERR(dentry) && d_inode(dentry)) {
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
if (IS_ERR(dentry)) {
- inode_unlock(parent->d_inode);
+ inode_unlock(d_inode(parent));
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
}
@@ -351,7 +424,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
static struct dentry *failed_creating(struct dentry *dentry)
{
- inode_unlock(dentry->d_parent->d_inode);
+ inode_unlock(d_inode(dentry->d_parent));
dput(dentry);
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
return NULL;
@@ -359,7 +432,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
static struct dentry *end_creating(struct dentry *dentry)
{
- inode_unlock(dentry->d_parent->d_inode);
+ inode_unlock(d_inode(dentry->d_parent));
return dentry;
}
@@ -414,8 +487,10 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
inode->i_mode = mode;
inode->i_fop = fops ? fops : &tracefs_file_operations;
inode->i_private = data;
+ inode->i_uid = d_inode(dentry->d_parent)->i_uid;
+ inode->i_gid = d_inode(dentry->d_parent)->i_gid;
d_instantiate(dentry, inode);
- fsnotify_create(dentry->d_parent->d_inode, dentry);
+ fsnotify_create(d_inode(dentry->d_parent), dentry);
return end_creating(dentry);
}
@@ -432,15 +507,18 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
if (unlikely(!inode))
return failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ /* Do not set bits for OTH */
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP;
inode->i_op = ops;
inode->i_fop = &simple_dir_operations;
+ inode->i_uid = d_inode(dentry->d_parent)->i_uid;
+ inode->i_gid = d_inode(dentry->d_parent)->i_gid;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
- inc_nlink(dentry->d_parent->d_inode);
- fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+ inc_nlink(d_inode(dentry->d_parent));
+ fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
return end_creating(dentry);
}
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 5c4b845754a7..314c80b24a76 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -5,7 +5,7 @@ ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o
-ubifs-y += misc.o
+ubifs-y += misc.o sysfs.o
ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o
ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 22be7aeb96c4..c57b46a352d8 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = {
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
.empty_dir = ubifs_crypt_empty_dir,
- .max_namelen = UBIFS_MAX_NLEN,
};
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 7c61d0ec0159..86151889548e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -349,20 +349,97 @@ out_budg:
return err;
}
-static int do_tmpfile(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct inode **whiteout)
+static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
+{
+ int err;
+ umode_t mode = S_IFCHR | WHITEOUT_MODE;
+ struct inode *inode;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct fscrypt_name nm;
+
+ /*
+ * Create an inode('nlink = 1') for whiteout without updating journal,
+ * let ubifs_jnl_rename() store it on flash to complete rename whiteout
+ * atomically.
+ */
+
+ dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
+ dentry, mode, dir->i_ino);
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ return ERR_PTR(err);
+
+ inode = ubifs_new_inode(c, dir, mode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_free;
+ }
+
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
+ ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations);
+
+ err = ubifs_init_security(dir, inode, &dentry->d_name);
+ if (err)
+ goto out_inode;
+
+ /* The dir size is updated by do_rename. */
+ insert_inode_hash(inode);
+
+ return inode;
+
+out_inode:
+ make_bad_inode(inode);
+ iput(inode);
+out_free:
+ fscrypt_free_filename(&nm);
+ ubifs_err(c, "cannot create whiteout file, error %d", err);
+ return ERR_PTR(err);
+}
+
+/**
+ * lock_2_inodes - a wrapper for locking two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
+ */
+static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+ mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+ mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+}
+
+/**
+ * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ */
+static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+ mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+ mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
+}
+
+static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1};
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1};
struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
- struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir);
+ struct ubifs_inode *ui;
int err, instantiated = 0;
struct fscrypt_name nm;
/*
- * Budget request settings: new dirty inode, new direntry,
- * budget for dirtied inode will be released via writeback.
+ * Budget request settings: new inode, new direntry, changing the
+ * parent directory inode.
+ * Allocate budget separately for new dirtied inode, the budget will
+ * be released via writeback.
*/
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
@@ -392,42 +469,30 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
}
ui = ubifs_inode(inode);
- if (whiteout) {
- init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
- ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations);
- }
-
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
goto out_inode;
mutex_lock(&ui->ui_mutex);
insert_inode_hash(inode);
-
- if (whiteout) {
- mark_inode_dirty(inode);
- drop_nlink(inode);
- *whiteout = inode;
- } else {
- d_tmpfile(dentry, inode);
- }
+ d_tmpfile(dentry, inode);
ubifs_assert(c, ui->dirty);
instantiated = 1;
mutex_unlock(&ui->ui_mutex);
- mutex_lock(&dir_ui->ui_mutex);
+ lock_2_inodes(dir, inode);
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
- mutex_unlock(&dir_ui->ui_mutex);
+ unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
return 0;
out_cancel:
- mutex_unlock(&dir_ui->ui_mutex);
+ unlock_2_inodes(dir, inode);
out_inode:
make_bad_inode(inode);
if (!instantiated)
@@ -441,12 +506,6 @@ out_budg:
return err;
}
-static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
-{
- return do_tmpfile(dir, dentry, mode, NULL);
-}
-
/**
* vfs_dent_type - get VFS directory entry type.
* @type: UBIFS directory entry type
@@ -660,32 +719,6 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
return 0;
}
-/**
- * lock_2_inodes - a wrapper for locking two UBIFS inodes.
- * @inode1: first inode
- * @inode2: second inode
- *
- * We do not implement any tricks to guarantee strict lock ordering, because
- * VFS has already done it for us on the @i_mutex. So this is just a simple
- * wrapper function.
- */
-static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
-{
- mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
- mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
-}
-
-/**
- * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
- * @inode1: first inode
- * @inode2: second inode
- */
-static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
-{
- mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
- mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
-}
-
static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
@@ -949,7 +982,8 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
int err, sz_change;
- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1};
struct fscrypt_name nm;
/*
@@ -1207,7 +1241,7 @@ out_budg:
* @inode1: first inode
* @inode2: second inode
* @inode3: third inode
- * @inode4: fouth inode
+ * @inode4: fourth inode
*
* This function is used for 'ubifs_rename()' and @inode1 may be the same as
* @inode2 whereas @inode3 and @inode4 may be %NULL.
@@ -1233,7 +1267,7 @@ static void lock_4_inodes(struct inode *inode1, struct inode *inode2,
* @inode1: first inode
* @inode2: second inode
* @inode3: third inode
- * @inode4: fouth inode
+ * @inode4: fourth inode
*/
static void unlock_4_inodes(struct inode *inode1, struct inode *inode2,
struct inode *inode3, struct inode *inode4)
@@ -1264,17 +1298,19 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
.dirtied_ino = 3 };
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
+ struct ubifs_budget_req wht_req;
struct timespec64 time;
unsigned int saved_nlink;
struct fscrypt_name old_nm, new_nm;
/*
- * Budget request settings: deletion direntry, new direntry, removing
- * the old inode, and changing old and new parent directory inodes.
+ * Budget request settings:
+ * req: deletion direntry, new direntry, removing the old inode,
+ * and changing old and new parent directory inodes.
+ *
+ * wht_req: new whiteout inode for RENAME_WHITEOUT.
*
- * However, this operation also marks the target inode as dirty and
- * does not write it, so we allocate budget for the target inode
- * separately.
+ * ino_req: marks the target inode as dirty and does not write it.
*/
dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x",
@@ -1331,20 +1367,44 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_release;
}
- err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout);
- if (err) {
+ /*
+ * The whiteout inode has no dentry but is pinned in memory;
+ * umount won't happen during the rename process because we
+ * hold the parent dentry.
+ */
+ whiteout = create_whiteout(old_dir, old_dentry);
+ if (IS_ERR(whiteout)) {
+ err = PTR_ERR(whiteout);
kfree(dev);
goto out_release;
}
- spin_lock(&whiteout->i_lock);
- whiteout->i_state |= I_LINKABLE;
- spin_unlock(&whiteout->i_lock);
-
whiteout_ui = ubifs_inode(whiteout);
whiteout_ui->data = dev;
whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));
ubifs_assert(c, !whiteout_ui->dirty);
+
+ memset(&wht_req, 0, sizeof(struct ubifs_budget_req));
+ wht_req.new_ino = 1;
+ wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8);
+ /*
+ * To avoid deadlock between the space budget (holds ui_mutex
+ * and waits for wb work) and writeback work (waits for
+ * ui_mutex), do the space budget before locking ubifs inodes.
+ */
+ err = ubifs_budget_space(c, &wht_req);
+ if (err) {
+ /*
+ * Whiteout inode can not be written on flash by
+ * ubifs_jnl_write_inode(), because it's neither
+ * dirty nor zero-nlink.
+ */
+ iput(whiteout);
+ goto out_release;
+ }
+
+ /* Add the old_dentry size to the old_dir size. */
+ old_sz -= CALC_DENT_SIZE(fname_len(&old_nm));
}
lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
@@ -1416,29 +1476,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
if (unlink && IS_SYNC(new_inode))
sync = 1;
- }
-
- if (whiteout) {
- struct ubifs_budget_req wht_req = { .dirtied_ino = 1,
- .dirtied_ino_d = \
- ALIGN(ubifs_inode(whiteout)->data_len, 8) };
-
- err = ubifs_budget_space(c, &wht_req);
- if (err) {
- kfree(whiteout_ui->data);
- whiteout_ui->data_len = 0;
- iput(whiteout);
- goto out_release;
- }
-
- inc_nlink(whiteout);
- mark_inode_dirty(whiteout);
-
- spin_lock(&whiteout->i_lock);
- whiteout->i_state &= ~I_LINKABLE;
- spin_unlock(&whiteout->i_lock);
-
- iput(whiteout);
+ /*
+ * S_SYNC flag of whiteout inherits from the old_dir, and we
+ * have already checked the old dir inode. So there is no need
+ * to check whiteout.
+ */
}
err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir,
@@ -1449,6 +1491,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
ubifs_release_budget(c, &req);
+ if (whiteout) {
+ ubifs_release_budget(c, &wht_req);
+ iput(whiteout);
+ }
+
mutex_lock(&old_inode_ui->ui_mutex);
release = old_inode_ui->dirty;
mark_inode_dirty_sync(old_inode);
@@ -1457,11 +1504,16 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
if (release)
ubifs_release_budget(c, &ino_req);
if (IS_SYNC(old_inode))
- err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
+ /*
+ * Rename finished here. Although old inode cannot be updated
+ * on flash, old ctime is not a big problem, don't return err
+ * code to userspace.
+ */
+ old_inode->i_sb->s_op->write_inode(old_inode, NULL);
fscrypt_free_filename(&old_nm);
fscrypt_free_filename(&new_nm);
- return err;
+ return 0;
out_cancel:
if (unlink) {
@@ -1482,11 +1534,11 @@ out_cancel:
inc_nlink(old_dir);
}
}
+ unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
if (whiteout) {
- drop_nlink(whiteout);
+ ubifs_release_budget(c, &wht_req);
iput(whiteout);
}
- unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
out_release:
ubifs_release_budget(c, &ino_req);
ubifs_release_budget(c, &req);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5cfa28cd00cd..0383fbdc95ff 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
}
if (!PagePrivate(page)) {
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
@@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len)
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
kunmap(page);
@@ -1287,25 +1287,25 @@ int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
-static void ubifs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ubifs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
- ubifs_assert(c, PagePrivate(page));
- if (offset || length < PAGE_SIZE)
- /* Partial page remains dirty */
+ ubifs_assert(c, folio_test_private(folio));
+ if (offset || length < folio_size(folio))
+ /* Partial folio remains dirty */
return;
- if (PageChecked(page))
+ if (folio_test_checked(folio))
release_new_page_budget(c);
else
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
- ClearPageChecked(page);
+ folio_detach_private(folio);
+ folio_clear_checked(folio);
}
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -1445,18 +1445,18 @@ static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from)
return generic_file_write_iter(iocb, from);
}
-static int ubifs_set_page_dirty(struct page *page)
+static bool ubifs_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- int ret;
- struct inode *inode = page->mapping->host;
- struct ubifs_info *c = inode->i_sb->s_fs_info;
+ bool ret;
+ struct ubifs_info *c = mapping->host->i_sb->s_fs_info;
- ret = __set_page_dirty_nobuffers(page);
+ ret = filemap_dirty_folio(mapping, folio);
/*
* An attempt to dirty a page without budgeting for it - should not
* happen.
*/
- ubifs_assert(c, ret == 0);
+ ubifs_assert(c, ret == false);
return ret;
}
@@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct address_space *mapping,
return rc;
if (PagePrivate(page)) {
- ClearPagePrivate(page);
- SetPagePrivate(newpage);
+ detach_page_private(page);
+ attach_page_private(newpage, (void *)1);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
return 0;
ubifs_assert(c, PagePrivate(page));
ubifs_assert(c, 0);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
return 1;
}
@@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
else {
if (!PageChecked(page))
ubifs_convert_page_budget(c);
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
@@ -1646,8 +1646,8 @@ const struct address_space_operations ubifs_file_address_operations = {
.writepage = ubifs_writepage,
.write_begin = ubifs_write_begin,
.write_end = ubifs_write_end,
- .invalidatepage = ubifs_invalidatepage,
- .set_page_dirty = ubifs_set_page_dirty,
+ .invalidate_folio = ubifs_invalidate_folio,
+ .dirty_folio = ubifs_dirty_folio,
#ifdef CONFIG_MIGRATION
.migratepage = ubifs_migrate_page,
#endif
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index dc3e26e9ed7b..3134d070fcc0 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -692,6 +692,9 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
for (i = 0; ; i++) {
int space_before, space_after;
+ /* Maybe continue after find and break before find */
+ lp.lnum = -1;
+
cond_resched();
/* Give the commit an opportunity to run */
@@ -753,8 +756,19 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
* caller instead of the original '-EAGAIN'.
*/
err = ubifs_return_leb(c, lp.lnum);
- if (err)
+ if (err) {
ret = err;
+ /*
+ * Returning the LEB failed, so it
+ * may stay "taken" forever. Switch
+ * ubifs to read-only mode; the wbuf
+ * sync will then return -EROFS and
+ * enter the "out" error branch.
+ */
+ ubifs_ro_mode(c, ret);
+ }
+ /* Avoid returning the LEB again at 'out' */
+ lp.lnum = -1;
break;
}
goto out;
@@ -843,7 +857,8 @@ out:
ubifs_wbuf_sync_nolock(wbuf);
ubifs_ro_mode(c, ret);
mutex_unlock(&wbuf->io_mutex);
- ubifs_return_leb(c, lp.lnum);
+ if (lp.lnum != -1)
+ ubifs_return_leb(c, lp.lnum);
return ret;
}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 00b61dba62b7..1607a3c76681 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -194,6 +194,24 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
return err;
}
+static void record_magic_error(struct ubifs_stats_info *stats)
+{
+ if (stats)
+ stats->magic_errors++;
+}
+
+static void record_node_error(struct ubifs_stats_info *stats)
+{
+ if (stats)
+ stats->node_errors++;
+}
+
+static void record_crc_error(struct ubifs_stats_info *stats)
+{
+ if (stats)
+ stats->crc_errors++;
+}
+
/**
* ubifs_check_node - check node.
* @c: UBIFS file-system description object
@@ -238,6 +256,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len,
if (!quiet)
ubifs_err(c, "bad magic %#08x, expected %#08x",
magic, UBIFS_NODE_MAGIC);
+ record_magic_error(c->stats);
err = -EUCLEAN;
goto out;
}
@@ -246,6 +265,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len,
if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) {
if (!quiet)
ubifs_err(c, "bad node type %d", type);
+ record_node_error(c->stats);
goto out;
}
@@ -270,6 +290,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len,
if (!quiet)
ubifs_err(c, "bad CRC: calculated %#08x, read %#08x",
crc, node_crc);
+ record_crc_error(c->stats);
err = -EUCLEAN;
goto out;
}
@@ -833,16 +854,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
*/
n = aligned_len >> c->max_write_shift;
if (n) {
- n <<= c->max_write_shift;
+ int m = n - 1;
+
dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
wbuf->offs);
- err = ubifs_leb_write(c, wbuf->lnum, buf + written,
- wbuf->offs, n);
+
+ if (m) {
+ /* '(n-1)<<c->max_write_shift < len' is always true. */
+ m <<= c->max_write_shift;
+ err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+ wbuf->offs, m);
+ if (err)
+ goto out;
+ wbuf->offs += m;
+ aligned_len -= m;
+ len -= m;
+ written += m;
+ }
+
+ /*
+ * The non-written len of buf may be less than 'n' because
+ * parameter 'len' is not 8 bytes aligned, so here we read
+ * min(len, n) bytes from buf.
+ */
+ n = 1 << c->max_write_shift;
+ memcpy(wbuf->buf, buf + written, min(len, n));
+ if (n > len) {
+ ubifs_assert(c, n - len < 8);
+ ubifs_pad(c, wbuf->buf + len, n - len);
+ }
+
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n);
if (err)
goto out;
wbuf->offs += n;
aligned_len -= n;
- len -= n;
+ len -= min(len, n);
written += n;
}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index c6a863487780..71bcebe45f9c 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -108,7 +108,7 @@ static int setflags(struct inode *inode, int flags)
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_budget_req req = { .dirtied_ino = 1,
- .dirtied_ino_d = ui->data_len };
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
err = ubifs_budget_space(c, &req);
if (err)
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 8ea680dba61e..75dab0ae3939 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1207,9 +1207,9 @@ out_free:
* @sync: non-zero if the write-buffer has to be synchronized
*
* This function implements the re-name operation which may involve writing up
- * to 4 inodes and 2 directory entries. It marks the written inodes as clean
- * and returns zero on success. In case of failure, a negative error code is
- * returned.
+ * to 4 inodes (new inode, whiteout inode, old and new parent directory inodes)
+ * and 2 directory entries. It marks the written inodes as clean and returns
+ * zero on success. In case of failure, a negative error code is returned.
*/
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
const struct inode *old_inode,
@@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
void *p;
union ubifs_key key;
struct ubifs_dent_node *dent, *dent2;
- int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0;
+ int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0;
int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
- struct ubifs_inode *new_ui;
+ struct ubifs_inode *new_ui, *whiteout_ui;
u8 hash_old_dir[UBIFS_HASH_ARR_SZ];
u8 hash_new_dir[UBIFS_HASH_ARR_SZ];
u8 hash_new_inode[UBIFS_HASH_ARR_SZ];
+ u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ];
u8 hash_dent1[UBIFS_HASH_ARR_SZ];
u8 hash_dent2[UBIFS_HASH_ARR_SZ];
@@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
} else
ilen = 0;
+ if (whiteout) {
+ whiteout_ui = ubifs_inode(whiteout);
+ ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex));
+ ubifs_assert(c, whiteout->i_nlink == 1);
+ ubifs_assert(c, !whiteout_ui->dirty);
+ wlen = UBIFS_INO_NODE_SZ;
+ wlen += whiteout_ui->data_len;
+ } else
+ wlen = 0;
+
aligned_dlen1 = ALIGN(dlen1, 8);
aligned_dlen2 = ALIGN(dlen2, 8);
- len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
+ len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) +
+ ALIGN(wlen, 8) + ALIGN(plen, 8);
if (move)
len += plen;
@@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
p += ALIGN(ilen, 8);
}
+ if (whiteout) {
+ pack_inode(c, p, whiteout, 0);
+ err = ubifs_node_calc_hash(c, p, hash_whiteout_inode);
+ if (err)
+ goto out_release;
+
+ p += ALIGN(wlen, 8);
+ }
+
if (!move) {
pack_inode(c, p, old_dir, 1);
err = ubifs_node_calc_hash(c, p, hash_old_dir);
@@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
if (new_inode)
ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
new_inode->i_ino);
+ if (whiteout)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+ whiteout->i_ino);
}
release_head(c, BASEHD);
@@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm);
if (err)
goto out_ro;
-
- ubifs_delete_orphan(c, whiteout->i_ino);
} else {
err = ubifs_add_dirt(c, lnum, dlen2);
if (err)
@@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
offs += ALIGN(ilen, 8);
}
+ if (whiteout) {
+ ino_key_init(c, &key, whiteout->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, wlen,
+ hash_whiteout_inode);
+ if (err)
+ goto out_ro;
+ offs += ALIGN(wlen, 8);
+ }
+
ino_key_init(c, &key, old_dir->i_ino);
err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir);
if (err)
@@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
new_ui->synced_i_size = new_ui->ui_size;
spin_unlock(&new_ui->ui_lock);
}
+ /*
+ * No need to mark whiteout inode clean.
+ * Whiteout doesn't have non-zero size, no need to update
+ * synced_i_size for whiteout_ui.
+ */
mark_inode_clean(c, ubifs_inode(old_dir));
if (move)
mark_inode_clean(c, ubifs_inode(new_dir));
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5260d3e531bb..4211e4456b1e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -106,7 +106,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
* property values should be @lp->free == @c->leb_size and
* @lp->dirty == 0, but that is not the case. The reason is that
* the LEB had been garbage collected before it became the bud,
- * and there was not commit inbetween. The garbage collector
+ * and there was no commit in between. The garbage collector
* resets the free and dirty space without recording it
* anywhere except lprops, so if there was no commit then
* lprops does not have that information.
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f0fb25727d96..bad67455215f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -268,7 +268,7 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
{
struct ubifs_inode *ui;
- ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS);
+ ui = alloc_inode_sb(sb, ubifs_inode_slab, GFP_NOFS);
if (!ui)
return NULL;
@@ -1264,6 +1264,10 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
return err;
+ err = ubifs_sysfs_register(c);
+ if (err)
+ goto out_debugging;
+
err = check_volume_empty(c);
if (err)
goto out_free;
@@ -1367,7 +1371,7 @@ static int mount_ubifs(struct ubifs_info *c)
sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
if (!c->ro_mount) {
/* Create background thread */
- c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
+ c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name);
if (IS_ERR(c->bgt)) {
err = PTR_ERR(c->bgt);
c->bgt = NULL;
@@ -1375,7 +1379,6 @@ static int mount_ubifs(struct ubifs_info *c)
c->bgt_name, err);
goto out_wbufs;
}
- wake_up_process(c->bgt);
}
err = ubifs_read_master(c);
@@ -1641,6 +1644,8 @@ out_free:
vfree(c->sbuf);
kfree(c->bottom_up_buf);
kfree(c->sup_node);
+ ubifs_sysfs_unregister(c);
+out_debugging:
ubifs_debugging_exit(c);
return err;
}
@@ -1684,6 +1689,7 @@ static void ubifs_umount(struct ubifs_info *c)
kfree(c->bottom_up_buf);
kfree(c->sup_node);
ubifs_debugging_exit(c);
+ ubifs_sysfs_unregister(c);
}
/**
@@ -1780,7 +1786,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
goto out;
/* Create background thread */
- c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
+ c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name);
if (IS_ERR(c->bgt)) {
err = PTR_ERR(c->bgt);
c->bgt = NULL;
@@ -1788,7 +1794,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
c->bgt_name, err);
goto out;
}
- wake_up_process(c->bgt);
c->orph_buf = vmalloc(c->leb_size);
if (!c->orph_buf) {
@@ -1853,7 +1858,6 @@ out:
kthread_stop(c->bgt);
c->bgt = NULL;
}
- free_wbufs(c);
kfree(c->write_reserve_buf);
c->write_reserve_buf = NULL;
vfree(c->ileb_buf);
@@ -2436,14 +2440,20 @@ static int __init ubifs_init(void)
dbg_debugfs_init();
+ err = ubifs_sysfs_init();
+ if (err)
+ goto out_dbg;
+
err = register_filesystem(&ubifs_fs_type);
if (err) {
pr_err("UBIFS error (pid %d): cannot register file system, error %d",
current->pid, err);
- goto out_dbg;
+ goto out_sysfs;
}
return 0;
+out_sysfs:
+ ubifs_sysfs_exit();
out_dbg:
dbg_debugfs_exit();
ubifs_compressors_exit();
@@ -2462,6 +2472,7 @@ static void __exit ubifs_exit(void)
WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) != 0);
dbg_debugfs_exit();
+ ubifs_sysfs_exit();
ubifs_compressors_exit();
unregister_shrinker(&ubifs_shrinker_info);
diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c
new file mode 100644
index 000000000000..06ad8fa1fcfb
--- /dev/null
+++ b/fs/ubifs/sysfs.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2021 Cisco Systems
+ *
+ * Author: Stefan Schaeckeler
+ */
+
+
+#include <linux/fs.h>
+#include "ubifs.h"
+
+enum attr_id_t {
+ attr_errors_magic,
+ attr_errors_node,
+ attr_errors_crc,
+};
+
+struct ubifs_attr {
+ struct attribute attr;
+ enum attr_id_t attr_id;
+};
+
+#define UBIFS_ATTR(_name, _mode, _id) \
+static struct ubifs_attr ubifs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+}
+
+#define UBIFS_ATTR_FUNC(_name, _mode) UBIFS_ATTR(_name, _mode, _name)
+
+UBIFS_ATTR_FUNC(errors_magic, 0444);
+UBIFS_ATTR_FUNC(errors_crc, 0444);
+UBIFS_ATTR_FUNC(errors_node, 0444);
+
+#define ATTR_LIST(name) (&ubifs_attr_##name.attr)
+
+static struct attribute *ubifs_attrs[] = {
+ ATTR_LIST(errors_magic),
+ ATTR_LIST(errors_node),
+ ATTR_LIST(errors_crc),
+ NULL,
+};
+ATTRIBUTE_GROUPS(ubifs);
+
+static ssize_t ubifs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct ubifs_info *sbi = container_of(kobj, struct ubifs_info,
+ kobj);
+
+ struct ubifs_attr *a = container_of(attr, struct ubifs_attr, attr);
+
+ switch (a->attr_id) {
+ case attr_errors_magic:
+ return sysfs_emit(buf, "%u\n", sbi->stats->magic_errors);
+ case attr_errors_node:
+ return sysfs_emit(buf, "%u\n", sbi->stats->node_errors);
+ case attr_errors_crc:
+ return sysfs_emit(buf, "%u\n", sbi->stats->crc_errors);
+ }
+ return 0;
+};
+
+static void ubifs_sb_release(struct kobject *kobj)
+{
+ struct ubifs_info *c = container_of(kobj, struct ubifs_info, kobj);
+
+ complete(&c->kobj_unregister);
+}
+
+static const struct sysfs_ops ubifs_attr_ops = {
+ .show = ubifs_attr_show,
+};
+
+static struct kobj_type ubifs_sb_ktype = {
+ .default_groups = ubifs_groups,
+ .sysfs_ops = &ubifs_attr_ops,
+ .release = ubifs_sb_release,
+};
+
+static struct kobj_type ubifs_ktype = {
+ .sysfs_ops = &ubifs_attr_ops,
+};
+
+static struct kset ubifs_kset = {
+ .kobj = {.ktype = &ubifs_ktype},
+};
+
+int ubifs_sysfs_register(struct ubifs_info *c)
+{
+ int ret, n;
+ char dfs_dir_name[UBIFS_DFS_DIR_LEN+1];
+
+ c->stats = kzalloc(sizeof(struct ubifs_stats_info), GFP_KERNEL);
+ if (!c->stats) {
+ ret = -ENOMEM;
+ goto out_last;
+ }
+ n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME,
+ c->vi.ubi_num, c->vi.vol_id);
+
+ if (n > UBIFS_DFS_DIR_LEN) {
+ /* The array size is too small */
+ ret = -EINVAL;
+ goto out_free;
+ }
+
+ c->kobj.kset = &ubifs_kset;
+ init_completion(&c->kobj_unregister);
+
+ ret = kobject_init_and_add(&c->kobj, &ubifs_sb_ktype, NULL,
+ "%s", dfs_dir_name);
+ if (ret)
+ goto out_put;
+
+ return 0;
+
+out_put:
+ kobject_put(&c->kobj);
+ wait_for_completion(&c->kobj_unregister);
+out_free:
+ kfree(c->stats);
+out_last:
+ ubifs_err(c, "cannot create sysfs entry for ubifs%d_%d, error %d\n",
+ c->vi.ubi_num, c->vi.vol_id, ret);
+ return ret;
+}
+
+void ubifs_sysfs_unregister(struct ubifs_info *c)
+{
+ kobject_del(&c->kobj);
+ kobject_put(&c->kobj);
+ wait_for_completion(&c->kobj_unregister);
+
+ kfree(c->stats);
+}
+
+int __init ubifs_sysfs_init(void)
+{
+ int ret;
+
+ kobject_set_name(&ubifs_kset.kobj, "ubifs");
+ ubifs_kset.kobj.parent = fs_kobj;
+ ret = kset_register(&ubifs_kset);
+
+ return ret;
+}
+
+void ubifs_sysfs_exit(void)
+{
+ kset_unregister(&ubifs_kset);
+}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c38066ce9ab0..008fa46ef61e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -27,6 +27,8 @@
#include <linux/security.h>
#include <linux/xattr.h>
#include <linux/random.h>
+#include <linux/sysfs.h>
+#include <linux/completion.h>
#include <crypto/hash_info.h>
#include <crypto/hash.h>
#include <crypto/algapi.h>
@@ -156,6 +158,13 @@
#endif
/*
+ * The UBIFS sysfs directory name pattern and maximum name length (3 for "ubi"
+ * + 1 for "_", plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte).
+ */
+#define UBIFS_DFS_DIR_NAME "ubi%d_%d"
+#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1)
+
+/*
* Lockdep classes for UBIFS inode @ui_mutex.
*/
enum {
@@ -372,7 +381,7 @@ struct ubifs_gced_idx_leb {
* @ui_mutex exists for two main reasons. At first it prevents inodes from
* being written back while UBIFS changing them, being in the middle of an VFS
* operation. This way UBIFS makes sure the inode fields are consistent. For
- * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and
+ * example, in 'ubifs_rename()' we change 4 inodes simultaneously, and
* write-back must not write any of them before we have finished.
*
* The second reason is budgeting - UBIFS has to budget all operations. If an
@@ -990,6 +999,18 @@ struct ubifs_budg_info {
int dent_budget;
};
+/**
+ * struct ubifs_stats_info - per-FS statistics information.
+ * @magic_errors: number of bad magic numbers (will be reset with a new mount).
+ * @node_errors: number of bad nodes (will be reset with a new mount).
+ * @crc_errors: number of bad crcs (will be reset with a new mount).
+ */
+struct ubifs_stats_info {
+ unsigned int magic_errors;
+ unsigned int node_errors;
+ unsigned int crc_errors;
+};
+
struct ubifs_debug_info;
/**
@@ -1251,6 +1272,10 @@ struct ubifs_debug_info;
* @mount_opts: UBIFS-specific mount options
*
* @dbg: debugging-related information
+ * @stats: statistics exported over sysfs
+ *
+ * @kobj: kobject for /sys/fs/ubifs/
+ * @kobj_unregister: completion to unregister sysfs kobject
*/
struct ubifs_info {
struct super_block *vfs_sb;
@@ -1286,6 +1311,9 @@ struct ubifs_info {
spinlock_t cs_lock;
wait_queue_head_t cmt_wq;
+ struct kobject kobj;
+ struct completion kobj_unregister;
+
unsigned int big_lpt:1;
unsigned int space_fixup:1;
unsigned int double_hash:1;
@@ -1493,6 +1521,7 @@ struct ubifs_info {
struct ubifs_mount_opts mount_opts;
struct ubifs_debug_info *dbg;
+ struct ubifs_stats_info *stats;
};
extern struct list_head ubifs_infos;
@@ -2072,6 +2101,12 @@ void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len,
int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
void *out, int *out_len, int compr_type);
+/* sysfs.c */
+int ubifs_sysfs_init(void);
+void ubifs_sysfs_exit(void);
+int ubifs_sysfs_register(struct ubifs_info *c);
+void ubifs_sysfs_unregister(struct ubifs_info *c);
+
#include "debug.h"
#include "misc.h"
#include "key.h"
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 70abdfad2df1..42e3e551fa4c 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -31,6 +31,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bio.h>
+#include <linux/iversion.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -43,7 +44,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
struct fileIdentDesc *fi = NULL;
struct fileIdentDesc cfi;
udf_pblk_t block, iblock;
- loff_t nf_pos;
+ loff_t nf_pos, emit_pos = 0;
int flen;
unsigned char *fname = NULL, *copy_name = NULL;
unsigned char *nameptr;
@@ -57,6 +58,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
int i, num, ret = 0;
struct extent_position epos = { NULL, 0, {0, 0} };
struct super_block *sb = dir->i_sb;
+ bool pos_valid = false;
if (ctx->pos == 0) {
if (!dir_emit_dot(file, ctx))
@@ -67,6 +69,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
if (nf_pos >= size)
goto out;
+ /*
+ * Something changed since last readdir (either lseek was called or dir
+ * changed)? We need to verify the position correctly points at the
+ * beginning of some dir entry so that the directory parsing code does
+ * not get confused. Since UDF does not have any reliable way of
+ * identifying beginning of dir entry (names are under user control),
+ * we need to scan the directory from the beginning.
+ */
+ if (!inode_eq_iversion(dir, file->f_version)) {
+ emit_pos = nf_pos;
+ nf_pos = 0;
+ } else {
+ pos_valid = true;
+ }
+
fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
if (!fname) {
ret = -ENOMEM;
@@ -122,13 +139,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
while (nf_pos < size) {
struct kernel_lb_addr tloc;
+ loff_t cur_pos = nf_pos;
- ctx->pos = (nf_pos >> 2) + 1;
+ /* Update file position only if we got past the current one */
+ if (nf_pos >= emit_pos) {
+ ctx->pos = (nf_pos >> 2) + 1;
+ pos_valid = true;
+ }
fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
&elen, &offset);
if (!fi)
goto out;
+ /* Still not at offset where user asked us to read from? */
+ if (cur_pos < emit_pos)
+ continue;
liu = le16_to_cpu(cfi.lengthOfImpUse);
lfi = cfi.lengthFileIdent;
@@ -186,8 +211,11 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
} /* end while */
ctx->pos = (nf_pos >> 2) + 1;
+ pos_valid = true;
out:
+ if (pos_valid)
+ file->f_version = inode_query_iversion(dir);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1baff8ddb754..0f6bf2504437 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -125,7 +125,8 @@ static int udf_adinicb_write_end(struct file *file, struct address_space *mappin
}
const struct address_space_operations udf_adinicb_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = udf_adinicb_readpage,
.writepage = udf_adinicb_writepage,
.write_begin = udf_adinicb_write_begin,
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 2ecf0e87660e..b5d611cee749 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -77,6 +77,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
GFP_KERNEL);
}
if (!iinfo->i_data) {
+ make_bad_inode(inode);
iput(inode);
return ERR_PTR(-ENOMEM);
}
@@ -86,6 +87,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
dinfo->i_location.partitionReferenceNum,
start, &err);
if (err) {
+ make_bad_inode(inode);
iput(inode);
return ERR_PTR(err);
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 1d6b7a50736b..ca4fa710e562 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -235,7 +235,8 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations udf_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = udf_readpage,
.readahead = udf_readahead,
.writepage = udf_writepage,
@@ -258,10 +259,6 @@ int udf_expand_file_adinicb(struct inode *inode)
char *kaddr;
struct udf_inode_info *iinfo = UDF_I(inode);
int err;
- struct writeback_control udf_wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 1,
- };
WARN_ON_ONCE(!inode_is_locked(inode));
if (!iinfo->i_lenAlloc) {
@@ -305,8 +302,10 @@ int udf_expand_file_adinicb(struct inode *inode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
/* from now on we have normal address_space methods */
inode->i_data.a_ops = &udf_aops;
+ set_page_dirty(page);
+ unlock_page(page);
up_write(&iinfo->i_data_sem);
- err = inode->i_data.a_ops->writepage(page, &udf_wbc);
+ err = filemap_fdatawrite(inode->i_mapping);
if (err) {
/* Restore everything back so that we don't lose data... */
lock_page(page);
@@ -317,6 +316,7 @@ int udf_expand_file_adinicb(struct inode *inode)
unlock_page(page);
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
inode->i_data.a_ops = &udf_adinicb_aops;
+ iinfo->i_lenAlloc = inode->i_size;
up_write(&iinfo->i_data_sem);
}
put_page(page);
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index f1094cdcd6cd..46d697172197 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -47,8 +47,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
unsigned long udf_get_last_block(struct super_block *sb)
{
- struct block_device *bdev = sb->s_bdev;
- struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk);
+ struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk);
unsigned long lblock = 0;
/*
@@ -56,7 +55,7 @@ unsigned long udf_get_last_block(struct super_block *sb)
* Try using the device size...
*/
if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0)
- lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits;
+ lblock = sb_bdev_nr_blocks(sb);
if (lblock)
return lblock - 1;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index caeef08efed2..b3d5f97f16cd 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -30,6 +30,7 @@
#include <linux/sched.h>
#include <linux/crc-itu-t.h>
#include <linux/exportfs.h>
+#include <linux/iversion.h>
static inline int udf_match(int len1, const unsigned char *name1, int len2,
const unsigned char *name2)
@@ -74,11 +75,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
if (fileident) {
if (adinicb || (offset + lfi < 0)) {
- memcpy(udf_get_fi_ident(sfi), fileident, lfi);
+ memcpy(sfi->impUse + liu, fileident, lfi);
} else if (offset >= 0) {
memcpy(fibh->ebh->b_data + offset, fileident, lfi);
} else {
- memcpy(udf_get_fi_ident(sfi), fileident, -offset);
+ memcpy(sfi->impUse + liu, fileident, -offset);
memcpy(fibh->ebh->b_data, fileident - offset,
lfi + offset);
}
@@ -87,11 +88,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
offset += lfi;
if (adinicb || (offset + padlen < 0)) {
- memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen);
+ memset(sfi->impUse + liu + lfi, 0x00, padlen);
} else if (offset >= 0) {
memset(fibh->ebh->b_data + offset, 0x00, padlen);
} else {
- memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset);
+ memset(sfi->impUse + liu + lfi, 0x00, -offset);
memset(fibh->ebh->b_data, 0x00, padlen + offset);
}
@@ -134,6 +135,8 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
mark_buffer_dirty_inode(fibh->ebh, inode);
mark_buffer_dirty_inode(fibh->sbh, inode);
}
+ inode_inc_iversion(inode);
+
return 0;
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b2d7c57d0688..4042d9739fb7 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -57,6 +57,7 @@
#include <linux/crc-itu-t.h>
#include <linux/log2.h>
#include <asm/byteorder.h>
+#include <linux/iversion.h>
#include "udf_sb.h"
#include "udf_i.h"
@@ -135,7 +136,7 @@ static struct kmem_cache *udf_inode_cachep;
static struct inode *udf_alloc_inode(struct super_block *sb)
{
struct udf_inode_info *ei;
- ei = kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, udf_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -149,6 +150,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
init_rwsem(&ei->i_data_sem);
ei->cached_extent.lstart = -1;
spin_lock_init(&ei->i_extent_cache_lock);
+ inode_set_iversion(&ei->vfs_inode, 1);
return &ei->vfs_inode;
}
@@ -1175,8 +1177,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
struct udf_inode_info *vati;
uint32_t pos;
struct virtualAllocationTable20 *vat20;
- sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ sector_t blocks = sb_bdev_nr_blocks(sb);
udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
if (!sbi->s_vat_inode &&
@@ -1838,8 +1839,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
int ret;
if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
- udf_fixed_to_variable(block) >=
- i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits)
+ udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb))
return -EAGAIN;
bh = udf_read_tagged(sb, block, block, &ident);
@@ -1901,8 +1901,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
last[last_count++] = *lastblock - 152;
for (i = 0; i < last_count; i++) {
- if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits)
+ if (last[i] >= sb_bdev_nr_blocks(sb))
continue;
ret = udf_check_anchor_block(sb, last[i], fileset);
if (ret != -EAGAIN) {
@@ -2475,7 +2474,6 @@ static unsigned int udf_count_free_table(struct super_block *sb,
unsigned int accum = 0;
uint32_t elen;
struct kernel_lb_addr eloc;
- int8_t etype;
struct extent_position epos;
mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
@@ -2483,7 +2481,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
epos.offset = sizeof(struct unallocSpaceEntry);
epos.bh = NULL;
- while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1)
+ while (udf_next_aext(table, &epos, &eloc, &elen, 1) != -1)
accum += (elen >> table->i_sb->s_blocksize_bits);
brelse(epos.bh);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index ac628de69601..d0dda01620f0 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -526,7 +526,8 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations ufs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = ufs_readpage,
.writepage = ufs_writepage,
.write_begin = ufs_write_begin,
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 00a01471ea05..23377c1baed9 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1443,7 +1443,7 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
{
struct ufs_inode_info *ei;
- ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, ufs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore
index 361294571ab0..51cdf3fb4dd4 100644
--- a/fs/unicode/.gitignore
+++ b/fs/unicode/.gitignore
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
/mkutf8data
-/utf8data.h
+/utf8data.c
diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index 2c27b9a5cd6c..da786a687fdc 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -3,12 +3,13 @@
# UTF-8 normalization
#
config UNICODE
- bool "UTF-8 normalization and casefolding support"
+ tristate "UTF-8 normalization and casefolding support"
help
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
- support.
+ support. If you say M here the large table of case foldings will
+ be a separate loadable module that gets requested only when a file
+ system actually uses it.
config UNICODE_NORMALIZATION_SELFTEST
tristate "Test UTF-8 normalization support"
depends on UNICODE
- default n
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index b88aecc86550..0e51c0025a16 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,15 +1,18 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_UNICODE) += unicode.o
+ifneq ($(CONFIG_UNICODE),)
+obj-y += unicode.o
+endif
+obj-$(CONFIG_UNICODE) += utf8data.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
unicode-y := utf8-norm.o utf8-core.o
-$(obj)/utf8-norm.o: $(obj)/utf8data.h
+# utf8data.o (listed in obj-$(CONFIG_UNICODE) above) is compiled from utf8data.c.
+$(obj)/utf8data.o: $(obj)/utf8data.c
-# In the normal build, the checked-in utf8data.h is just shipped.
+# In the normal build, the checked-in utf8data.c is just shipped.
#
-# To generate utf8data.h from UCD, put *.txt files in this directory
+# To generate utf8data.c from UCD, put *.txt files in this directory
# and pass REGENERATE_UTF8DATA=1 from the command line.
ifdef REGENERATE_UTF8DATA
@@ -24,15 +27,15 @@ quiet_cmd_utf8data = GEN $@
-t $(srctree)/$(src)/NormalizationTest.txt \
-o $@
-$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
+$(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
$(call if_changed,utf8data)
else
-$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE
- $(call if_changed,shipped)
+$(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE
+ $(call if_changed,copy)
endif
-targets += utf8data.h
+targets += utf8data.c
hostprogs += mkutf8data
diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c
index ff2025ac5a32..bc1a7c8b5c8d 100644
--- a/fs/unicode/mkutf8data.c
+++ b/fs/unicode/mkutf8data.c
@@ -3287,12 +3287,10 @@ static void write_file(void)
open_fail(utf8_name, errno);
fprintf(file, "/* This file is generated code, do not edit. */\n");
- fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
- fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
- fprintf(file, "#endif\n");
fprintf(file, "\n");
- fprintf(file, "static const unsigned int utf8vers = %#x;\n",
- unicode_maxage);
+ fprintf(file, "#include <linux/module.h>\n");
+ fprintf(file, "#include <linux/kernel.h>\n");
+ fprintf(file, "#include \"utf8n.h\"\n");
fprintf(file, "\n");
fprintf(file, "static const unsigned int utf8agetab[] = {\n");
for (i = 0; i != ages_count; i++)
@@ -3339,6 +3337,22 @@ static void write_file(void)
fprintf(file, "\n");
}
fprintf(file, "};\n");
+ fprintf(file, "\n");
+ fprintf(file, "struct utf8data_table utf8_data_table = {\n");
+ fprintf(file, "\t.utf8agetab = utf8agetab,\n");
+ fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
+ fprintf(file, "\n");
+ fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
+ fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
+ fprintf(file, "\n");
+ fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
+ fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
+ fprintf(file, "\n");
+ fprintf(file, "\t.utf8data = utf8data,\n");
+ fprintf(file, "};\n");
+ fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
+ fprintf(file, "\n");
+ fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
fclose(file);
}
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index dc25823bfed9..67aaadc3ab07 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -5,16 +5,13 @@
#include <linux/slab.h>
#include <linux/parser.h>
#include <linux/errno.h>
-#include <linux/unicode.h>
#include <linux/stringhash.h>
#include "utf8n.h"
int utf8_validate(const struct unicode_map *um, const struct qstr *str)
{
- const struct utf8data *data = utf8nfdi(um->version);
-
- if (utf8nlen(data, str->name, str->len) < 0)
+ if (utf8nlen(um, UTF8_NFDI, str->name, str->len) < 0)
return -1;
return 0;
}
@@ -23,14 +20,13 @@ EXPORT_SYMBOL(utf8_validate);
int utf8_strncmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
- const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur1, cur2;
int c1, c2;
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ if (utf8ncursor(&cur1, um, UTF8_NFDI, s1->name, s1->len) < 0)
return -EINVAL;
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ if (utf8ncursor(&cur2, um, UTF8_NFDI, s2->name, s2->len) < 0)
return -EINVAL;
do {
@@ -50,14 +46,13 @@ EXPORT_SYMBOL(utf8_strncmp);
int utf8_strncasecmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
- const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur1, cur2;
int c1, c2;
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0)
return -EINVAL;
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ if (utf8ncursor(&cur2, um, UTF8_NFDICF, s2->name, s2->len) < 0)
return -EINVAL;
do {
@@ -81,12 +76,11 @@ int utf8_strncasecmp_folded(const struct unicode_map *um,
const struct qstr *cf,
const struct qstr *s1)
{
- const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur1;
int c1, c2;
int i = 0;
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0)
return -EINVAL;
do {
@@ -105,11 +99,10 @@ EXPORT_SYMBOL(utf8_strncasecmp_folded);
int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
- const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur;
size_t nlen = 0;
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0)
return -EINVAL;
for (nlen = 0; nlen < dlen; nlen++) {
@@ -128,12 +121,11 @@ EXPORT_SYMBOL(utf8_casefold);
int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
struct qstr *str)
{
- const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur;
int c;
unsigned long hash = init_name_hash(salt);
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0)
return -EINVAL;
while ((c = utf8byte(&cur))) {
@@ -149,11 +141,10 @@ EXPORT_SYMBOL(utf8_casefold_hash);
int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
- const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur;
ssize_t nlen = 0;
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ if (utf8ncursor(&cur, um, UTF8_NFDI, str->name, str->len) < 0)
return -EINVAL;
for (nlen = 0; nlen < dlen; nlen++) {
@@ -167,69 +158,59 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
}
return -EINVAL;
}
-
EXPORT_SYMBOL(utf8_normalize);
-static int utf8_parse_version(const char *version, unsigned int *maj,
- unsigned int *min, unsigned int *rev)
+static const struct utf8data *find_table_version(const struct utf8data *table,
+ size_t nr_entries, unsigned int version)
{
- substring_t args[3];
- char version_string[12];
- static const struct match_token token[] = {
- {1, "%d.%d.%d"},
- {0, NULL}
- };
-
- strncpy(version_string, version, sizeof(version_string));
-
- if (match_token(version_string, token, args) != 1)
- return -EINVAL;
-
- if (match_int(&args[0], maj) || match_int(&args[1], min) ||
- match_int(&args[2], rev))
- return -EINVAL;
+ size_t i = nr_entries - 1;
- return 0;
+ while (version < table[i].maxage)
+ i--;
+ if (version > table[i].maxage)
+ return NULL;
+ return &table[i];
}
-struct unicode_map *utf8_load(const char *version)
+/*
+ * utf8_load - build a unicode_map for one version of the UTF-8 data.
+ * @version: unicode version, encoded as by UNICODE_AGE().
+ *
+ * Takes a reference on utf8_data_table with symbol_request(), checks that
+ * @version is supported and looks up the NFDI and NFDICF tables for it.
+ *
+ * Returns the new map, ERR_PTR(-ENOMEM) if the allocation fails, or
+ * ERR_PTR(-EINVAL) if the data table or the requested version is not
+ * available. A successful load is undone with utf8_unload().
+ */
+struct unicode_map *utf8_load(unsigned int version)
{
- struct unicode_map *um = NULL;
- int unicode_version;
-
- if (version) {
- unsigned int maj, min, rev;
-
- if (utf8_parse_version(version, &maj, &min, &rev) < 0)
- return ERR_PTR(-EINVAL);
-
- if (!utf8version_is_supported(maj, min, rev))
- return ERR_PTR(-EINVAL);
-
- unicode_version = UNICODE_AGE(maj, min, rev);
- } else {
- unicode_version = utf8version_latest();
- printk(KERN_WARNING"UTF-8 version not specified. "
- "Assuming latest supported version (%d.%d.%d).",
- (unicode_version >> 16) & 0xff,
- (unicode_version >> 8) & 0xff,
- (unicode_version & 0xff));
- }
+ struct unicode_map *um;
um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
if (!um)
return ERR_PTR(-ENOMEM);
-
- um->charset = "UTF-8";
- um->version = unicode_version;
-
+ um->version = version;
+
+ um->tables = symbol_request(utf8_data_table);
+ if (!um->tables)
+ goto out_free_um;
+
+ if (!utf8version_is_supported(um, version))
+ goto out_symbol_put;
+ um->ntab[UTF8_NFDI] = find_table_version(um->tables->utf8nfdidata,
+ um->tables->utf8nfdidata_size, um->version);
+ if (!um->ntab[UTF8_NFDI])
+ goto out_symbol_put;
+ um->ntab[UTF8_NFDICF] = find_table_version(um->tables->utf8nfdicfdata,
+ um->tables->utf8nfdicfdata_size, um->version);
+ if (!um->ntab[UTF8_NFDICF])
+ goto out_symbol_put;
return um;
+
+out_symbol_put:
+ /*
+ * symbol_put() stringifies its argument, so it must be given the
+ * symbol that was requested above (as utf8_unload() does), not the
+ * um->tables pointer expression.
+ */
+ symbol_put(utf8_data_table);
+out_free_um:
+ kfree(um);
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL(utf8_load);
void utf8_unload(struct unicode_map *um)
{
- kfree(um);
+ if (um) {
+ symbol_put(utf8_data_table);
+ kfree(um);
+ }
}
EXPORT_SYMBOL(utf8_unload);
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 1d2d2e5b906a..768f8ab448b8 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -6,34 +6,17 @@
#include "utf8n.h"
-struct utf8data {
- unsigned int maxage;
- unsigned int offset;
-};
-
-#define __INCLUDED_FROM_UTF8NORM_C__
-#include "utf8data.h"
-#undef __INCLUDED_FROM_UTF8NORM_C__
-
-int utf8version_is_supported(u8 maj, u8 min, u8 rev)
+int utf8version_is_supported(const struct unicode_map *um, unsigned int version)
{
- int i = ARRAY_SIZE(utf8agetab) - 1;
- unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
+ int i = um->tables->utf8agetab_size - 1;
- while (i >= 0 && utf8agetab[i] != 0) {
- if (sb_utf8version == utf8agetab[i])
+ while (i >= 0 && um->tables->utf8agetab[i] != 0) {
+ if (version == um->tables->utf8agetab[i])
return 1;
i--;
}
return 0;
}
-EXPORT_SYMBOL(utf8version_is_supported);
-
-int utf8version_latest(void)
-{
- return utf8vers;
-}
-EXPORT_SYMBOL(utf8version_latest);
/*
* UTF-8 valid ranges.
@@ -168,7 +151,7 @@ typedef const unsigned char utf8trie_t;
* underlying datatype: unsigned char.
*
* leaf[0]: The unicode version, stored as a generation number that is
- * an index into utf8agetab[]. With this we can filter code
+ * an index into ->utf8agetab[]. With this we can filter code
* points based on the unicode version in which they were
* defined. The CCC of a non-defined code point is 0.
* leaf[1]: Canonical Combining Class. During normalization, we need
@@ -316,21 +299,19 @@ utf8hangul(const char *str, unsigned char *hangul)
* is well-formed and corresponds to a known unicode code point. The
* shorthand for this will be "is valid UTF-8 unicode".
*/
-static utf8leaf_t *utf8nlookup(const struct utf8data *data,
- unsigned char *hangul, const char *s, size_t len)
+static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
+ enum utf8_normalization n, unsigned char *hangul, const char *s,
+ size_t len)
{
- utf8trie_t *trie = NULL;
+ utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset;
int offlen;
int offset;
int mask;
int node;
- if (!data)
- return NULL;
if (len == 0)
return NULL;
- trie = utf8data + data->offset;
node = 1;
while (node) {
offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
@@ -392,172 +373,29 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data,
*
* Forwards to utf8nlookup().
*/
-static utf8leaf_t *utf8lookup(const struct utf8data *data,
- unsigned char *hangul, const char *s)
+static utf8leaf_t *utf8lookup(const struct unicode_map *um,
+ enum utf8_normalization n, unsigned char *hangul, const char *s)
{
- return utf8nlookup(data, hangul, s, (size_t)-1);
-}
-
-/*
- * Maximum age of any character in s.
- * Return -1 if s is not valid UTF-8 unicode.
- * Return 0 if only non-assigned code points are used.
- */
-int utf8agemax(const struct utf8data *data, const char *s)
-{
- utf8leaf_t *leaf;
- int age = 0;
- int leaf_age;
- unsigned char hangul[UTF8HANGULLEAF];
-
- if (!data)
- return -1;
-
- while (*s) {
- leaf = utf8lookup(data, hangul, s);
- if (!leaf)
- return -1;
-
- leaf_age = utf8agetab[LEAF_GEN(leaf)];
- if (leaf_age <= data->maxage && leaf_age > age)
- age = leaf_age;
- s += utf8clen(s);
- }
- return age;
+ return utf8nlookup(um, n, hangul, s, (size_t)-1);
}
-EXPORT_SYMBOL(utf8agemax);
-
-/*
- * Minimum age of any character in s.
- * Return -1 if s is not valid UTF-8 unicode.
- * Return 0 if non-assigned code points are used.
- */
-int utf8agemin(const struct utf8data *data, const char *s)
-{
- utf8leaf_t *leaf;
- int age;
- int leaf_age;
- unsigned char hangul[UTF8HANGULLEAF];
-
- if (!data)
- return -1;
- age = data->maxage;
- while (*s) {
- leaf = utf8lookup(data, hangul, s);
- if (!leaf)
- return -1;
- leaf_age = utf8agetab[LEAF_GEN(leaf)];
- if (leaf_age <= data->maxage && leaf_age < age)
- age = leaf_age;
- s += utf8clen(s);
- }
- return age;
-}
-EXPORT_SYMBOL(utf8agemin);
-
-/*
- * Maximum age of any character in s, touch at most len bytes.
- * Return -1 if s is not valid UTF-8 unicode.
- */
-int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
-{
- utf8leaf_t *leaf;
- int age = 0;
- int leaf_age;
- unsigned char hangul[UTF8HANGULLEAF];
-
- if (!data)
- return -1;
-
- while (len && *s) {
- leaf = utf8nlookup(data, hangul, s, len);
- if (!leaf)
- return -1;
- leaf_age = utf8agetab[LEAF_GEN(leaf)];
- if (leaf_age <= data->maxage && leaf_age > age)
- age = leaf_age;
- len -= utf8clen(s);
- s += utf8clen(s);
- }
- return age;
-}
-EXPORT_SYMBOL(utf8nagemax);
-
-/*
- * Maximum age of any character in s, touch at most len bytes.
- * Return -1 if s is not valid UTF-8 unicode.
- */
-int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
-{
- utf8leaf_t *leaf;
- int leaf_age;
- int age;
- unsigned char hangul[UTF8HANGULLEAF];
-
- if (!data)
- return -1;
- age = data->maxage;
- while (len && *s) {
- leaf = utf8nlookup(data, hangul, s, len);
- if (!leaf)
- return -1;
- leaf_age = utf8agetab[LEAF_GEN(leaf)];
- if (leaf_age <= data->maxage && leaf_age < age)
- age = leaf_age;
- len -= utf8clen(s);
- s += utf8clen(s);
- }
- return age;
-}
-EXPORT_SYMBOL(utf8nagemin);
-
-/*
- * Length of the normalization of s.
- * Return -1 if s is not valid UTF-8 unicode.
- *
- * A string of Default_Ignorable_Code_Point has length 0.
- */
-ssize_t utf8len(const struct utf8data *data, const char *s)
-{
- utf8leaf_t *leaf;
- size_t ret = 0;
- unsigned char hangul[UTF8HANGULLEAF];
-
- if (!data)
- return -1;
- while (*s) {
- leaf = utf8lookup(data, hangul, s);
- if (!leaf)
- return -1;
- if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
- ret += utf8clen(s);
- else if (LEAF_CCC(leaf) == DECOMPOSE)
- ret += strlen(LEAF_STR(leaf));
- else
- ret += utf8clen(s);
- s += utf8clen(s);
- }
- return ret;
-}
-EXPORT_SYMBOL(utf8len);
/*
* Length of the normalization of s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
-ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
+ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
+ const char *s, size_t len)
{
utf8leaf_t *leaf;
size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];
- if (!data)
- return -1;
while (len && *s) {
- leaf = utf8nlookup(data, hangul, s, len);
+ leaf = utf8nlookup(um, n, hangul, s, len);
if (!leaf)
return -1;
- if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
+ if (um->tables->utf8agetab[LEAF_GEN(leaf)] >
+ um->ntab[n]->maxage)
ret += utf8clen(s);
else if (LEAF_CCC(leaf) == DECOMPOSE)
ret += strlen(LEAF_STR(leaf));
@@ -568,7 +406,6 @@ ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
}
return ret;
}
-EXPORT_SYMBOL(utf8nlen);
/*
* Set up an utf8cursor for use by utf8byte().
@@ -580,14 +417,13 @@ EXPORT_SYMBOL(utf8nlen);
*
* Returns -1 on error, 0 on success.
*/
-int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
- const char *s, size_t len)
+int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
+ enum utf8_normalization n, const char *s, size_t len)
{
- if (!data)
- return -1;
if (!s)
return -1;
- u8c->data = data;
+ u8c->um = um;
+ u8c->n = n;
u8c->s = s;
u8c->p = NULL;
u8c->ss = NULL;
@@ -604,23 +440,6 @@ int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
return -1;
return 0;
}
-EXPORT_SYMBOL(utf8ncursor);
-
-/*
- * Set up an utf8cursor for use by utf8byte().
- *
- * u8c : pointer to cursor.
- * data : const struct utf8data to use for normalization.
- * s : NUL-terminated string.
- *
- * Returns -1 on error, 0 on success.
- */
-int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
- const char *s)
-{
- return utf8ncursor(u8c, data, s, (unsigned int)-1);
-}
-EXPORT_SYMBOL(utf8cursor);
/*
* Get one byte from the normalized form of the string described by u8c.
@@ -678,9 +497,9 @@ int utf8byte(struct utf8cursor *u8c)
/* Look up the data for the current character. */
if (u8c->p) {
- leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
+ leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
} else {
- leaf = utf8nlookup(u8c->data, u8c->hangul,
+ leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul,
u8c->s, u8c->len);
}
@@ -690,7 +509,8 @@ int utf8byte(struct utf8cursor *u8c)
ccc = LEAF_CCC(leaf);
/* Characters that are too new have CCC 0. */
- if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
+ if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] >
+ u8c->um->ntab[u8c->n]->maxage) {
ccc = STOPPER;
} else if (ccc == DECOMPOSE) {
u8c->len -= utf8clen(u8c->s);
@@ -704,7 +524,7 @@ int utf8byte(struct utf8cursor *u8c)
goto ccc_mismatch;
}
- leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
+ leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
if (!leaf)
return -1;
ccc = LEAF_CCC(leaf);
@@ -765,28 +585,10 @@ ccc_mismatch:
}
}
}
-EXPORT_SYMBOL(utf8byte);
-
-const struct utf8data *utf8nfdi(unsigned int maxage)
-{
- int i = ARRAY_SIZE(utf8nfdidata) - 1;
-
- while (maxage < utf8nfdidata[i].maxage)
- i--;
- if (maxage > utf8nfdidata[i].maxage)
- return NULL;
- return &utf8nfdidata[i];
-}
-EXPORT_SYMBOL(utf8nfdi);
-
-const struct utf8data *utf8nfdicf(unsigned int maxage)
-{
- int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
- while (maxage < utf8nfdicfdata[i].maxage)
- i--;
- if (maxage > utf8nfdicfdata[i].maxage)
- return NULL;
- return &utf8nfdicfdata[i];
-}
-EXPORT_SYMBOL(utf8nfdicf);
+#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE
+EXPORT_SYMBOL_GPL(utf8version_is_supported);
+EXPORT_SYMBOL_GPL(utf8nlen);
+EXPORT_SYMBOL_GPL(utf8ncursor);
+EXPORT_SYMBOL_GPL(utf8byte);
+#endif
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index 6fe8af7edccb..eb2bbdd688d7 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -18,9 +18,7 @@ unsigned int failed_tests;
unsigned int total_tests;
/* Tests will be based on this version. */
-#define latest_maj 12
-#define latest_min 1
-#define latest_rev 0
+#define UTF8_LATEST UNICODE_AGE(12, 1, 0)
#define _test(cond, func, line, fmt, ...) do { \
total_tests++; \
@@ -160,18 +158,22 @@ static const struct {
}
};
-static void check_utf8_nfdi(void)
+static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n,
+ const char *s)
+{
+ return utf8nlen(um, n, s, (size_t)-1);
+}
+
+static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
+ enum utf8_normalization n, const char *s)
+{
+ return utf8ncursor(u8c, um, n, s, (unsigned int)-1);
+}
+
+static void check_utf8_nfdi(struct unicode_map *um)
{
int i;
struct utf8cursor u8c;
- const struct utf8data *data;
-
- data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev));
- if (!data) {
- pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
- __func__, latest_maj, latest_min, latest_rev);
- return;
- }
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
int len = strlen(nfdi_test_data[i].str);
@@ -179,10 +181,11 @@ static void check_utf8_nfdi(void)
int j = 0;
unsigned char c;
- test((utf8len(data, nfdi_test_data[i].str) == nlen));
- test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen));
+ test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen));
+ test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) ==
+ nlen));
- if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0)
+ if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0)
pr_err("can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
@@ -196,18 +199,10 @@ static void check_utf8_nfdi(void)
}
}
-static void check_utf8_nfdicf(void)
+static void check_utf8_nfdicf(struct unicode_map *um)
{
int i;
struct utf8cursor u8c;
- const struct utf8data *data;
-
- data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev));
- if (!data) {
- pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
- __func__, latest_maj, latest_min, latest_rev);
- return;
- }
for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
int len = strlen(nfdicf_test_data[i].str);
@@ -215,10 +210,13 @@ static void check_utf8_nfdicf(void)
int j = 0;
unsigned char c;
- test((utf8len(data, nfdicf_test_data[i].str) == nlen));
- test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen));
+ test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) ==
+ nlen));
+ test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) ==
+ nlen));
- if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0)
+ if (utf8cursor(&u8c, um, UTF8_NFDICF,
+ nfdicf_test_data[i].str) < 0)
pr_err("can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
@@ -232,16 +230,9 @@ static void check_utf8_nfdicf(void)
}
}
-static void check_utf8_comparisons(void)
+static void check_utf8_comparisons(struct unicode_map *table)
{
int i;
- struct unicode_map *table = utf8_load("12.1.0");
-
- if (IS_ERR(table)) {
- pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
- __func__, latest_maj, latest_min, latest_rev);
- return;
- }
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
const struct qstr s1 = {.name = nfdi_test_data[i].str,
@@ -262,42 +253,49 @@ static void check_utf8_comparisons(void)
test_f(!utf8_strncasecmp(table, &s1, &s2),
"%s %s comparison mismatch\n", s1.name, s2.name);
}
-
- utf8_unload(table);
}
-static void check_supported_versions(void)
+static void check_supported_versions(struct unicode_map *um)
{
/* Unicode 7.0.0 should be supported. */
- test(utf8version_is_supported(7, 0, 0));
+ test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
/* Unicode 9.0.0 should be supported. */
- test(utf8version_is_supported(9, 0, 0));
+ test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
/* Unicode 1x.0.0 (the latest version) should be supported. */
- test(utf8version_is_supported(latest_maj, latest_min, latest_rev));
+ test(utf8version_is_supported(um, UTF8_LATEST));
/* Next versions don't exist. */
- test(!utf8version_is_supported(13, 0, 0));
- test(!utf8version_is_supported(0, 0, 0));
- test(!utf8version_is_supported(-1, -1, -1));
+ test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
+ test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
+ test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
}
static int __init init_test_ucd(void)
{
+ struct unicode_map *um;
+
failed_tests = 0;
total_tests = 0;
- check_supported_versions();
- check_utf8_nfdi();
- check_utf8_nfdicf();
- check_utf8_comparisons();
+ um = utf8_load(UTF8_LATEST);
+ if (IS_ERR(um)) {
+ pr_err("%s: Unable to load utf8 table.\n", __func__);
+ return PTR_ERR(um);
+ }
+
+ check_supported_versions(um);
+ check_utf8_nfdi(um);
+ check_utf8_nfdicf(um);
+ check_utf8_comparisons(um);
if (!failed_tests)
pr_info("All %u tests passed\n", total_tests);
else
pr_err("%u out of %u tests failed\n", failed_tests,
total_tests);
+ utf8_unload(um);
return 0;
}
diff --git a/fs/unicode/utf8data.h_shipped b/fs/unicode/utf8data.c_shipped
index 76e4f0e1b089..d9b62901aa96 100644
--- a/fs/unicode/utf8data.h_shipped
+++ b/fs/unicode/utf8data.c_shipped
@@ -1,9 +1,8 @@
/* This file is generated code, do not edit. */
-#ifndef __INCLUDED_FROM_UTF8NORM_C__
-#error Only nls_utf8-norm.c should include this file.
-#endif
-static const unsigned int utf8vers = 0xc0100;
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include "utf8n.h"
static const unsigned int utf8agetab[] = {
0,
@@ -4107,3 +4106,18 @@ static const unsigned char utf8data[64256] = {
0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,0x86,0xcf,0x06,0x02,0x00,
0x81,0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00
};
+
+struct utf8data_table utf8_data_table = {
+ .utf8agetab = utf8agetab,
+ .utf8agetab_size = ARRAY_SIZE(utf8agetab),
+
+ .utf8nfdicfdata = utf8nfdicfdata,
+ .utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),
+
+ .utf8nfdidata = utf8nfdidata,
+ .utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),
+
+ .utf8data = utf8data,
+};
+EXPORT_SYMBOL_GPL(utf8_data_table);
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
index 0acd530c2c79..bd00d587747a 100644
--- a/fs/unicode/utf8n.h
+++ b/fs/unicode/utf8n.h
@@ -11,53 +11,9 @@
#include <linux/export.h>
#include <linux/string.h>
#include <linux/module.h>
+#include <linux/unicode.h>
-/* Encoding a unicode version number as a single unsigned int. */
-#define UNICODE_MAJ_SHIFT (16)
-#define UNICODE_MIN_SHIFT (8)
-
-#define UNICODE_AGE(MAJ, MIN, REV) \
- (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
- ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
- ((unsigned int)(REV)))
-
-/* Highest unicode version supported by the data tables. */
-extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
-extern int utf8version_latest(void);
-
-/*
- * Look for the correct const struct utf8data for a unicode version.
- * Returns NULL if the version requested is too new.
- *
- * Two normalization forms are supported: nfdi and nfdicf.
- *
- * nfdi:
- * - Apply unicode normalization form NFD.
- * - Remove any Default_Ignorable_Code_Point.
- *
- * nfdicf:
- * - Apply unicode normalization form NFD.
- * - Remove any Default_Ignorable_Code_Point.
- * - Apply a full casefold (C + F).
- */
-extern const struct utf8data *utf8nfdi(unsigned int maxage);
-extern const struct utf8data *utf8nfdicf(unsigned int maxage);
-
-/*
- * Determine the maximum age of any unicode character in the string.
- * Returns 0 if only unassigned code points are present.
- * Returns -1 if the input is not valid UTF-8.
- */
-extern int utf8agemax(const struct utf8data *data, const char *s);
-extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len);
-
-/*
- * Determine the minimum age of any unicode character in the string.
- * Returns 0 if any unassigned code points are present.
- * Returns -1 if the input is not valid UTF-8.
- */
-extern int utf8agemin(const struct utf8data *data, const char *s);
-extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
+int utf8version_is_supported(const struct unicode_map *um, unsigned int version);
/*
* Determine the length of the normalized from of the string,
@@ -65,8 +21,8 @@ extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
* Returns 0 if only ignorable code points are present.
* Returns -1 if the input is not valid UTF-8.
*/
-extern ssize_t utf8len(const struct utf8data *data, const char *s);
-extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
+ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
+ const char *s, size_t len);
/* Needed in struct utf8cursor below. */
#define UTF8HANGULLEAF (12)
@@ -75,7 +31,8 @@ extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
* Cursor structure used by the normalizer.
*/
struct utf8cursor {
- const struct utf8data *data;
+ const struct unicode_map *um;
+ enum utf8_normalization n;
const char *s;
const char *p;
const char *ss;
@@ -92,10 +49,8 @@ struct utf8cursor {
* Returns 0 on success.
* Returns -1 on failure.
*/
-extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
- const char *s);
-extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
- const char *s, size_t len);
+int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
+ enum utf8_normalization n, const char *s, size_t len);
/*
* Get the next byte in the normalization.
@@ -105,4 +60,24 @@ extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
*/
extern int utf8byte(struct utf8cursor *u8c);
+struct utf8data {
+ unsigned int maxage;
+ unsigned int offset;
+};
+
+struct utf8data_table {
+ const unsigned int *utf8agetab;
+ int utf8agetab_size;
+
+ const struct utf8data *utf8nfdicfdata;
+ int utf8nfdicfdata_size;
+
+ const struct utf8data *utf8nfdidata;
+ int utf8nfdidata_size;
+
+ const unsigned char *utf8data;
+};
+
+extern struct utf8data_table utf8_data_table;
+
#endif /* UTF8NORM_H */
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 22bf14ab2d16..aa0c47cb0d16 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -15,6 +15,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>
@@ -197,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
struct uffd_msg msg;
msg_init(&msg);
msg.event = UFFD_EVENT_PAGEFAULT;
+
+ if (!(features & UFFD_FEATURE_EXACT_ADDRESS))
+ address &= PAGE_MASK;
msg.arg.pagefault.address = address;
/*
* These flags indicate why the userfault occurred:
@@ -481,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+ uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason,
ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
@@ -877,7 +881,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev)
vma = prev;
else
@@ -1436,7 +1440,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }));
+ ((struct vm_userfaultfd_ctx){ ctx }),
+ anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
@@ -1613,7 +1618,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index 864c2fad23be..d74e0d336995 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -354,7 +354,7 @@ out:
const struct address_space_operations vboxsf_reg_aops = {
.readpage = vboxsf_readpage,
.writepage = vboxsf_writepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = simple_write_begin,
.write_end = vboxsf_write_end,
};
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 37dd3fe5b1e9..d2f6df69f611 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -241,7 +241,7 @@ static struct inode *vboxsf_alloc_inode(struct super_block *sb)
{
struct vboxsf_inode *sf_i;
- sf_i = kmem_cache_alloc(vboxsf_inode_cachep, GFP_NOFS);
+ sf_i = alloc_inode_sb(sb, vboxsf_inode_cachep, GFP_NOFS);
if (!sf_i)
return NULL;
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index aec2ebf7d25a..e1db0f3f7e5e 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -9,6 +9,7 @@
#include <linux/namei.h>
#include <linux/nls.h>
#include <linux/sizes.h>
+#include <linux/pagemap.h>
#include <linux/vfs.h>
#include "vfsmod.h"
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 0adb970f4e73..14e2fb49cff5 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Data verification functions, i.e. hooks for ->readpages()
+ * Data verification functions, i.e. hooks for ->readahead()
*
* Copyright 2019 Google LLC
*/
@@ -214,7 +214,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page);
* that fail verification are set to the Error state. Verification is skipped
* for pages already in the Error state, e.g. due to fscrypt decryption failure.
*
- * This is a helper function for use by the ->readpages() method of filesystems
+ * This is a helper function for use by the ->readahead() method of filesystems
* that issue bios to read data directly into the page cache. Filesystems that
* populate the page cache without issuing bios (e.g. non block-based
* filesystems) must instead call fsverity_verify_page() directly on each page.
diff --git a/fs/xattr.c b/fs/xattr.c
index 5c8c5175b385..998045165916 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -569,7 +569,8 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d,
}
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size);
+ posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d),
+ kvalue, size);
}
error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags);
@@ -667,7 +668,8 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error > 0) {
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error);
+ posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d),
+ kvalue, error);
if (size && copy_to_user(value, kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 6f49bf39183c..c557a030acfe 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -4,7 +4,6 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include <linux/backing-dev.h>
#include "xfs_message.h"
#include "xfs_trace.h"
@@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
"%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
current->comm, current->pid,
(unsigned int)size, __func__, lflags);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ memalloc_retry_wait(lflags);
} while (1);
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 54da6d717a06..b987dc2c6851 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -72,10 +72,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
/*
* Zone interfaces
*/
-
-#define kmem_zone kmem_cache
-#define kmem_zone_t struct kmem_cache
-
static inline struct page *
kmem_to_page(void *addr)
{
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 005abfd9fd34..1e4ee042d52f 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -248,6 +248,7 @@ xfs_initialize_perag(
spin_unlock(&mp->m_perag_lock);
radix_tree_preload_end();
+#ifdef __KERNEL__
/* Place kernel structure only init below this point. */
spin_lock_init(&pag->pag_ici_lock);
spin_lock_init(&pag->pagb_lock);
@@ -257,6 +258,7 @@ xfs_initialize_perag(
init_waitqueue_head(&pag->pagb_wait);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
+#endif /* __KERNEL__ */
error = xfs_buf_hash_init(pag);
if (error)
@@ -850,7 +852,7 @@ xfs_ag_shrink_space(
if (err2 != -ENOSPC)
goto resv_err;
- __xfs_bmap_add_free(*tpp, args.fsbno, delta, NULL, true);
+ __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
/*
* Roll the transaction before trying to re-init the per-ag
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 4c6f9045baca..e411d51c2589 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -64,6 +64,10 @@ struct xfs_perag {
/* Blocks reserved for the reverse mapping btree. */
struct xfs_ag_resv pag_rmapbt_resv;
+ /* for rcu-safe freeing */
+ struct rcu_head rcu_head;
+
+#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
/*
@@ -90,9 +94,6 @@ struct xfs_perag {
spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
struct rhashtable pag_buf_hash;
- /* for rcu-safe freeing */
- struct rcu_head rcu_head;
-
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
@@ -102,6 +103,7 @@ struct xfs_perag {
* or have some other means to control concurrency.
*/
struct rhashtable pagi_unlinked_hash;
+#endif /* __KERNEL__ */
};
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
@@ -116,23 +118,29 @@ void xfs_perag_put(struct xfs_perag *pag);
/*
* Perag iteration APIs
- *
- * XXX: for_each_perag_range() usage really needs an iterator to clean up when
- * we terminate at end_agno because we may have taken a reference to the perag
- * beyond end_agno. Right now callers have to be careful to catch and clean that
- * up themselves. This is not necessary for the callers of for_each_perag() and
- * for_each_perag_from() because they terminate at sb_agcount where there are
- * no perag structures in tree beyond end_agno.
*/
-#define for_each_perag_range(mp, next_agno, end_agno, pag) \
- for ((pag) = xfs_perag_get((mp), (next_agno)); \
- (pag) != NULL && (next_agno) <= (end_agno); \
- (next_agno) = (pag)->pag_agno + 1, \
- xfs_perag_put(pag), \
- (pag) = xfs_perag_get((mp), (next_agno)))
+static inline struct xfs_perag *
+xfs_perag_next(
+ struct xfs_perag *pag,
+ xfs_agnumber_t *agno,
+ xfs_agnumber_t end_agno)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ *agno = pag->pag_agno + 1;
+ xfs_perag_put(pag);
+ if (*agno > end_agno)
+ return NULL;
+ return xfs_perag_get(mp, *agno);
+}
+
+#define for_each_perag_range(mp, agno, end_agno, pag) \
+ for ((pag) = xfs_perag_get((mp), (agno)); \
+ (pag) != NULL; \
+ (pag) = xfs_perag_next((pag), &(agno), (end_agno)))
-#define for_each_perag_from(mp, next_agno, pag) \
- for_each_perag_range((mp), (next_agno), (mp)->m_sb.sb_agcount, (pag))
+#define for_each_perag_from(mp, agno, pag) \
+ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag))
#define for_each_perag(mp, agno, pag) \
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 2aa2b3484c28..fe94058d4e9e 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -91,7 +91,8 @@ xfs_ag_resv_critical(
trace_xfs_ag_resv_critical(pag, type, avail);
/* Critically low if less than 10% or max btree height remains. */
- return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
+ return XFS_TEST_ERROR(avail < orig / 10 ||
+ avail < pag->pag_mount->m_agbtree_maxlevels,
pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 95157f5a5a6c..b52ed339727f 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -27,7 +27,7 @@
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
-extern kmem_zone_t *xfs_bmap_free_item_zone;
+struct kmem_cache *xfs_extfree_item_cache;
struct workqueue_struct *xfs_alloc_wq;
@@ -82,6 +82,24 @@ xfs_prealloc_blocks(
}
/*
+ * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
+ * guarantee that we can refill the AGFL prior to allocating space in a nearly
+ * full AG. Although the space described by the free space btrees, the
+ * blocks used by the freesp btrees themselves, and the blocks owned by the
+ * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
+ * free space in the AG drop so low that the free space btrees cannot refill an
+ * empty AGFL up to the minimum level. Rather than grind through empty AGs
+ * until the fs goes down, we subtract this many AG blocks from the incore
+ * fdblocks to ensure user allocation does not overcommit the space the
+ * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to
+ * withhold space from xfs_mod_fdblocks, so we do not account for that here.
+ */
+#define XFS_ALLOCBT_AGFL_RESERVE 4
+
+/*
+ * Compute the number of blocks that we set aside to guarantee the ability to
+ * refill the AGFL and handle a full bmap btree split.
+ *
* In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
* AGF buffer (PV 947395), we place constraints on the relationship among
* actual allocations for data blocks, freelist blocks, and potential file data
@@ -93,14 +111,14 @@ xfs_prealloc_blocks(
* extents need to be actually allocated. To get around this, we explicitly set
* aside a few blocks which will not be reserved in delayed allocation.
*
- * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
- * potential split of the file's bmap btree.
+ * For each AG, we need to reserve enough blocks to replenish a totally empty
+ * AGFL and 4 more to handle a potential split of the file's bmap btree.
*/
unsigned int
xfs_alloc_set_aside(
struct xfs_mount *mp)
{
- return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
+ return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}
/*
@@ -124,12 +142,12 @@ xfs_alloc_ag_max_usable(
unsigned int blocks;
blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
- blocks += XFS_ALLOC_AGFL_RESERVE;
+ blocks += XFS_ALLOCBT_AGFL_RESERVE;
blocks += 3; /* AGF, AGI btree root blocks */
if (xfs_has_finobt(mp))
blocks++; /* finobt root block */
if (xfs_has_rmapbt(mp))
- blocks++; /* rmap root block */
+ blocks++; /* rmap root block */
if (xfs_has_reflink(mp))
blocks++; /* refcount root block */
@@ -426,8 +444,8 @@ xfs_alloc_fix_len(
*/
STATIC int /* error code */
xfs_alloc_fixup_trees(
- xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
- xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
+ struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */
+ struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */
xfs_agblock_t fbno, /* starting block of free extent */
xfs_extlen_t flen, /* length of free extent */
xfs_agblock_t rbno, /* starting block of returned extent */
@@ -488,8 +506,8 @@ xfs_alloc_fixup_trees(
struct xfs_btree_block *bnoblock;
struct xfs_btree_block *cntblock;
- bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
- cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+ bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
+ cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
if (XFS_IS_CORRUPT(mp,
bnoblock->bb_numrecs !=
@@ -1200,8 +1218,8 @@ xfs_alloc_ag_vextent_exact(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
- xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
- xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
+ struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
+ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
int error;
xfs_agblock_t fbno; /* start block of found extent */
xfs_extlen_t flen; /* length of found extent */
@@ -1512,7 +1530,7 @@ xfs_alloc_ag_vextent_lastblock(
* than minlen.
*/
if (*len || args->alignment > 1) {
- acur->cnt->bc_ptrs[0] = 1;
+ acur->cnt->bc_levels[0].ptr = 1;
do {
error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
if (error)
@@ -1658,8 +1676,8 @@ xfs_alloc_ag_vextent_size(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
struct xfs_agf *agf = args->agbp->b_addr;
- xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
- xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
+ struct xfs_btree_cur *bno_cur; /* cursor for bno btree */
+ struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */
int error; /* error result */
xfs_agblock_t fbno; /* start of found freespace */
xfs_extlen_t flen; /* length of found freespace */
@@ -2190,14 +2208,15 @@ xfs_free_ag_extent(
*/
/*
- * Compute and fill in value of m_ag_maxlevels.
+ * Compute and fill in value of m_alloc_maxlevels.
*/
void
xfs_alloc_compute_maxlevels(
xfs_mount_t *mp) /* file system mount structure */
{
- mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
+ mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
(mp->m_sb.sb_agblocks + 1) / 2);
+ ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
}
/*
@@ -2255,14 +2274,14 @@ xfs_alloc_min_freelist(
const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
unsigned int min_free;
- ASSERT(mp->m_ag_maxlevels > 0);
+ ASSERT(mp->m_alloc_maxlevels > 0);
/* space needed by-bno freespace btree */
min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_ag_maxlevels);
+ mp->m_alloc_maxlevels);
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_ag_maxlevels);
+ mp->m_alloc_maxlevels);
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
@@ -2439,7 +2458,7 @@ xfs_agfl_reset(
/*
* Defer an AGFL block free. This is effectively equivalent to
- * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
+ * xfs_free_extent_later() with some special handling particular to AGFL blocks.
*
* Deferring AGFL frees helps prevent log reservation overruns due to too many
* allocation operations in a transaction. AGFL frees are prone to this problem
@@ -2458,21 +2477,74 @@ xfs_defer_agfl_block(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent_free_item *new; /* new element */
- ASSERT(xfs_bmap_free_item_zone != NULL);
+ ASSERT(xfs_extfree_item_cache != NULL);
ASSERT(oinfo != NULL);
- new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
new->xefi_blockcount = 1;
- new->xefi_oinfo = *oinfo;
- new->xefi_skip_discard = false;
+ new->xefi_owner = oinfo->oi_owner;
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
}
+/*
+ * Add the extent to the list of extents to be freed at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+__xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
+{
+ struct xfs_extent_free_item *new; /* new element */
+#ifdef DEBUG
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= MAXEXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_extfree_item_cache != NULL);
+
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ new->xefi_startblock = bno;
+ new->xefi_blockcount = (xfs_extlen_t)len;
+ if (skip_discard)
+ new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ if (oinfo) {
+ ASSERT(oinfo->oi_offset == 0);
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ new->xefi_flags |= XFS_EFI_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+ new->xefi_owner = oinfo->oi_owner;
+ } else {
+ new->xefi_owner = XFS_RMAP_OWN_NULL;
+ }
+ trace_xfs_bmap_free_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
+}
+
#ifdef DEBUG
/*
* Check if an AGF has a free extent record whose length is equal to
@@ -2903,13 +2975,16 @@ xfs_agf_verify(
if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > mp->m_ag_maxlevels ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > mp->m_ag_maxlevels)
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
+ mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
+ mp->m_alloc_maxlevels)
return __this_address;
if (xfs_has_rmapbt(mp) &&
(be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > mp->m_rmap_maxlevels))
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
+ mp->m_rmap_maxlevels))
return __this_address;
if (xfs_has_rmapbt(mp) &&
@@ -3495,3 +3570,20 @@ xfs_agfl_walk(
return 0;
}
+
+int __init
+xfs_extfree_intent_init_cache(void)
+{
+ xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
+ sizeof(struct xfs_extent_free_item),
+ 0, 0, NULL);
+
+ return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_extfree_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_extfree_item_cache);
+ xfs_extfree_item_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index df4aefaf0046..d4c057b764f9 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -88,7 +88,6 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
/* freespace limit calculations */
-#define XFS_ALLOC_AGFL_RESERVE 4
unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
@@ -98,7 +97,7 @@ unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
/*
- * Compute and fill in value of m_ag_maxlevels.
+ * Compute and fill in value of m_alloc_maxlevels.
*/
void
xfs_alloc_compute_maxlevels(
@@ -248,4 +247,40 @@ xfs_buf_to_agfl_bno(
return bp->b_addr;
}
+void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+ xfs_filblks_t len, const struct xfs_owner_info *oinfo,
+ bool skip_discard);
+
+/*
+ * List of extents to be freed "later".
+ * The list is kept sorted on xefi_startblock.
+ */
+struct xfs_extent_free_item {
+ struct list_head xefi_list;
+ uint64_t xefi_owner;
+ xfs_fsblock_t xefi_startblock;/* starting fs block number */
+ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ unsigned int xefi_flags;
+};
+
+#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
+#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
+#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
+
+static inline void
+xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ __xfs_free_extent_later(tp, bno, len, oinfo, false);
+}
+
+
+extern struct kmem_cache *xfs_extfree_item_cache;
+
+int __init xfs_extfree_intent_init_cache(void);
+void xfs_extfree_intent_destroy_cache(void);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 6746fd735550..8c9f73cc0bee 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -20,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_ag.h"
+static struct kmem_cache *xfs_allocbt_cur_cache;
STATIC struct xfs_btree_cur *
xfs_allocbt_dup_cursor(
@@ -316,7 +317,7 @@ xfs_allocbt_verify(
if (pag && pag->pagf_init) {
if (level >= pag->pagf_levels[btnum])
return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ } else if (level >= mp->m_alloc_maxlevels)
return __this_address;
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
@@ -477,12 +478,8 @@ xfs_allocbt_init_common(
ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = btnum;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
+ xfs_allocbt_cur_cache);
cur->bc_ag.abt.active = false;
if (btnum == XFS_BTNUM_CNT) {
@@ -571,6 +568,17 @@ xfs_allocbt_commit_staged_btree(
}
}
+/* Calculate number of records in an alloc btree block. */
+static inline unsigned int
+xfs_allocbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_alloc_rec_t);
+ return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
+
/*
* Calculate number of records in an alloc btree block.
*/
@@ -581,10 +589,26 @@ xfs_allocbt_maxrecs(
int leaf)
{
blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+ return xfs_allocbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_alloc_rec_t);
- return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+/* Free space btrees are at their largest when every other block is free. */
+#define XFS_MAX_FREESP_RECORDS ((XFS_MAX_AG_BLOCKS + 1) / 2)
+
+/* Compute the max possible height for free space btrees. */
+unsigned int
+xfs_allocbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_allocbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_allocbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_FREESP_RECORDS);
}
/* Calculate the freespace btree size for some records. */
@@ -595,3 +619,22 @@ xfs_allocbt_calc_size(
{
return xfs_btree_calc_size(mp->m_alloc_mnr, len);
}
+
+int __init
+xfs_allocbt_init_cur_cache(void)
+{
+ xfs_allocbt_cur_cache = kmem_cache_create("xfs_bnobt_cur",
+ xfs_btree_cur_sizeof(xfs_allocbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_allocbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_allocbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_allocbt_cur_cache);
+ xfs_allocbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 2f6b816aaf9f..45df893ef6bb 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -60,4 +60,9 @@ extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
+unsigned int xfs_allocbt_maxlevels_ondisk(void);
+
+int __init xfs_allocbt_init_cur_cache(void);
+void xfs_allocbt_destroy_cur_cache(void);
+
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index fbc9d816882c..23523b802539 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1077,21 +1077,18 @@ xfs_attr_node_hasname(
state = xfs_da_state_alloc(args);
if (statep != NULL)
- *statep = NULL;
+ *statep = state;
/*
* Search to see if name exists, and get back a pointer to it.
*/
error = xfs_da3_node_lookup_int(state, &retval);
- if (error) {
- xfs_da_state_free(state);
- return error;
- }
+ if (error)
+ retval = error;
- if (statep != NULL)
- *statep = state;
- else
+ if (!statep)
xfs_da_state_free(state);
+
return retval;
}
@@ -1112,7 +1109,7 @@ xfs_attr_node_addname_find_attr(
*/
retval = xfs_attr_node_hasname(args, &dac->da_state);
if (retval != -ENOATTR && retval != -EEXIST)
- return retval;
+ goto error;
if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
goto error;
@@ -1337,7 +1334,7 @@ int xfs_attr_node_removename_setup(
error = xfs_attr_node_hasname(args, state);
if (error != -EEXIST)
- return error;
+ goto out;
error = 0;
ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index e1d11e314228..014daa8c542d 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -770,7 +770,7 @@ xfs_attr_fork_remove(
ASSERT(ip->i_afp->if_nextents == 0);
xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_afp);
ip->i_afp = NULL;
ip->i_forkoff = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b48230f1a361..74198dd82b03 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -37,8 +37,7 @@
#include "xfs_icache.h"
#include "xfs_iomap.h"
-
-kmem_zone_t *xfs_bmap_free_item_zone;
+struct kmem_cache *xfs_bmap_intent_cache;
/*
* Miscellaneous helper functions
@@ -93,6 +92,7 @@ xfs_bmap_compute_maxlevels(
maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
}
mp->m_bm_maxlevels[whichfork] = level;
+ ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
}
unsigned int
@@ -239,11 +239,11 @@ xfs_bmap_get_bp(
if (!cur)
return NULL;
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
- if (!cur->bc_bufs[i])
+ for (i = 0; i < cur->bc_maxlevels; i++) {
+ if (!cur->bc_levels[i].bp)
break;
- if (xfs_buf_daddr(cur->bc_bufs[i]) == bno)
- return cur->bc_bufs[i];
+ if (xfs_buf_daddr(cur->bc_levels[i].bp) == bno)
+ return cur->bc_levels[i].bp;
}
/* Chase down all the log items to see if the bp is there */
@@ -316,7 +316,7 @@ xfs_check_block(
*/
STATIC void
xfs_bmap_check_leaf_extents(
- xfs_btree_cur_t *cur, /* btree cursor or null */
+ struct xfs_btree_cur *cur, /* btree cursor or null */
xfs_inode_t *ip, /* incore inode pointer */
int whichfork) /* data or attr fork */
{
@@ -522,56 +522,6 @@ xfs_bmap_validate_ret(
#endif /* DEBUG */
/*
- * bmap free list manipulation functions
- */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-__xfs_bmap_add_free(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo,
- bool skip_discard)
-{
- struct xfs_extent_free_item *new; /* new element */
-#ifdef DEBUG
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
-
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
- ASSERT(!isnullstartblock(bno));
- agno = XFS_FSB_TO_AGNO(mp, bno);
- agbno = XFS_FSB_TO_AGBNO(mp, bno);
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(agbno < mp->m_sb.sb_agblocks);
- ASSERT(len < mp->m_sb.sb_agblocks);
- ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
- ASSERT(xfs_bmap_free_item_zone != NULL);
-
- new = kmem_cache_alloc(xfs_bmap_free_item_zone,
- GFP_KERNEL | __GFP_NOFAIL);
- new->xefi_startblock = bno;
- new->xefi_blockcount = (xfs_extlen_t)len;
- if (oinfo)
- new->xefi_oinfo = *oinfo;
- else
- new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
- new->xefi_skip_discard = skip_discard;
- trace_xfs_bmap_free_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
-}
-
-/*
* Inode fork format manipulation functions
*/
@@ -625,12 +575,12 @@ xfs_bmap_btree_to_extents(
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
- xfs_bmap_add_free(cur->bc_tp, cbno, 1, &oinfo);
+ xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
ip->i_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
- if (cur->bc_bufs[0] == cbp)
- cur->bc_bufs[0] = NULL;
+ if (cur->bc_levels[0].bp == cbp)
+ cur->bc_levels[0].bp = NULL;
xfs_iroot_realloc(ip, -1, whichfork);
ASSERT(ifp->if_broot == NULL);
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
@@ -925,7 +875,7 @@ xfs_bmap_add_attrfork_btree(
int *flags) /* inode logging flags */
{
struct xfs_btree_block *block = ip->i_df.if_broot;
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_btree_cur *cur; /* btree cursor */
int error; /* error return value */
xfs_mount_t *mp; /* file system mount struct */
int stat; /* newroot status */
@@ -968,7 +918,7 @@ xfs_bmap_add_attrfork_extents(
struct xfs_inode *ip, /* incore inode pointer */
int *flags) /* inode logging flags */
{
- xfs_btree_cur_t *cur; /* bmap btree cursor */
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
int error; /* error return value */
if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
@@ -1988,11 +1938,11 @@ xfs_bmap_add_extent_unwritten_real(
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ struct xfs_btree_cur **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp) /* inode logging flags */
{
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_btree_cur *cur; /* btree cursor */
int error; /* error return value */
int i; /* temp state */
struct xfs_ifork *ifp; /* inode fork pointer */
@@ -4601,7 +4551,7 @@ xfs_bmapi_convert_delalloc(
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4648,7 +4598,7 @@ xfs_bmapi_convert_delalloc(
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
@@ -5045,7 +4995,7 @@ xfs_bmap_del_extent_real(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
struct xfs_iext_cursor *icur,
- xfs_btree_cur_t *cur, /* if null, not a btree */
+ struct xfs_btree_cur *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
int whichfork, /* data or attr fork */
@@ -5296,7 +5246,7 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
- __xfs_bmap_add_free(tp, del->br_startblock,
+ __xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
(bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN);
@@ -6189,7 +6139,7 @@ __xfs_bmap_add(
bmap->br_blockcount,
bmap->br_state);
- bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
+ bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&bi->bi_list);
bi->bi_type = type;
bi->bi_owner = ip;
@@ -6300,3 +6250,20 @@ xfs_bmap_validate_extent(
return __this_address;
return NULL;
}
+
+int __init
+xfs_bmap_intent_init_cache(void)
+{
+ xfs_bmap_intent_cache = kmem_cache_create("xfs_bmap_intent",
+ sizeof(struct xfs_bmap_intent),
+ 0, 0, NULL);
+
+ return xfs_bmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_bmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_bmap_intent_cache);
+ xfs_bmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 67641f669918..03d9aaf87413 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -13,8 +13,6 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
-extern kmem_zone_t *xfs_bmap_free_item_zone;
-
/*
* Argument structure for xfs_bmap_alloc.
*/
@@ -44,19 +42,6 @@ struct xfs_bmalloca {
int flags;
};
-/*
- * List of extents to be free "later".
- * The list is kept sorted on xbf_startblock.
- */
-struct xfs_extent_free_item
-{
- xfs_fsblock_t xefi_startblock;/* starting fs block number */
- xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
- bool xefi_skip_discard;
- struct list_head xefi_list;
- struct xfs_owner_info xefi_oinfo; /* extent owner */
-};
-
#define XFS_BMAP_MAX_NMAP 4
/*
@@ -189,9 +174,6 @@ unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork);
-void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
- xfs_filblks_t len, const struct xfs_owner_info *oinfo,
- bool skip_discard);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -239,16 +221,6 @@ int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new, int *logflagsp);
-static inline void
-xfs_bmap_add_free(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo)
-{
- __xfs_bmap_add_free(tp, bno, len, oinfo, false);
-}
-
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
XFS_BMAP_UNMAP,
@@ -257,8 +229,8 @@ enum xfs_bmap_intent_type {
struct xfs_bmap_intent {
struct list_head bi_list;
enum xfs_bmap_intent_type bi_type;
- struct xfs_inode *bi_owner;
int bi_whichfork;
+ struct xfs_inode *bi_owner;
struct xfs_bmbt_irec bi_bmap;
};
@@ -290,4 +262,9 @@ int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
int flags);
+extern struct kmem_cache *xfs_bmap_intent_cache;
+
+int __init xfs_bmap_intent_init_cache(void);
+void xfs_bmap_intent_destroy_cache(void);
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 72444b8b38a6..453309fc85f2 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -22,6 +22,8 @@
#include "xfs_trace.h"
#include "xfs_rmap.h"
+static struct kmem_cache *xfs_bmbt_cur_cache;
+
/*
* Convert on-disk form of btree root to in-memory form.
*/
@@ -286,7 +288,7 @@ xfs_bmbt_free_block(
struct xfs_owner_info oinfo;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
- xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
ip->i_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -552,13 +554,9 @@ xfs_bmbt_init_cursor(
struct xfs_btree_cur *cur;
ASSERT(whichfork != XFS_COW_FORK);
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
+ mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- cur->bc_btnum = XFS_BTNUM_BMAP;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
cur->bc_ops = &xfs_bmbt_ops;
@@ -575,6 +573,17 @@ xfs_bmbt_init_cursor(
return cur;
}
+/* Calculate number of records in a block mapping btree block. */
+static inline unsigned int
+xfs_bmbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_bmbt_rec_t);
+ return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
/*
* Calculate number of records in a bmap btree block.
*/
@@ -585,10 +594,24 @@ xfs_bmbt_maxrecs(
int leaf)
{
blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+ return xfs_bmbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_bmbt_rec_t);
- return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+/* Compute the max possible height for block mapping btrees. */
+unsigned int
+xfs_bmbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_bmbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
+
+ /* One extra level for the inode root. */
+ return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
}
/*
@@ -654,3 +677,22 @@ xfs_bmbt_calc_size(
{
return xfs_btree_calc_size(mp->m_bmap_dmnr, len);
}
+
+int __init
+xfs_bmbt_init_cur_cache(void)
+{
+ xfs_bmbt_cur_cache = kmem_cache_create("xfs_bmbt_cur",
+ xfs_btree_cur_sizeof(xfs_bmbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_bmbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_bmbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_bmbt_cur_cache);
+ xfs_bmbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 729e3bc569be..3e7a40a83835 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -110,4 +110,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
+unsigned int xfs_bmbt_maxlevels_ondisk(void);
+
+int __init xfs_bmbt_init_cur_cache(void);
+void xfs_bmbt_destroy_cur_cache(void);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 298395481713..c1500b238520 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -22,11 +22,11 @@
#include "xfs_log.h"
#include "xfs_btree_staging.h"
#include "xfs_ag.h"
-
-/*
- * Cursor allocation zone.
- */
-kmem_zone_t *xfs_btree_cur_zone;
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
/*
* Btree magic numbers.
@@ -367,8 +367,8 @@ xfs_btree_del_cursor(
* way we won't have initialized all the entries down to 0.
*/
for (i = 0; i < cur->bc_nlevels; i++) {
- if (cur->bc_bufs[i])
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+ if (cur->bc_levels[i].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
else if (!error)
break;
}
@@ -379,7 +379,7 @@ xfs_btree_del_cursor(
kmem_free(cur->bc_ops);
if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
xfs_perag_put(cur->bc_ag.pag);
- kmem_cache_free(xfs_btree_cur_zone, cur);
+ kmem_cache_free(cur->bc_cache, cur);
}
/*
@@ -388,14 +388,14 @@ xfs_btree_del_cursor(
*/
int /* error */
xfs_btree_dup_cursor(
- xfs_btree_cur_t *cur, /* input cursor */
- xfs_btree_cur_t **ncur) /* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur) /* output cursor */
{
struct xfs_buf *bp; /* btree block's buffer pointer */
int error; /* error return value */
int i; /* level number of btree block */
xfs_mount_t *mp; /* mount structure for filesystem */
- xfs_btree_cur_t *new; /* new cursor value */
+ struct xfs_btree_cur *new; /* new cursor value */
xfs_trans_t *tp; /* transaction pointer, can be NULL */
tp = cur->bc_tp;
@@ -415,9 +415,9 @@ xfs_btree_dup_cursor(
* For each level current, re-get the buffer and copy the ptr value.
*/
for (i = 0; i < new->bc_nlevels; i++) {
- new->bc_ptrs[i] = cur->bc_ptrs[i];
- new->bc_ra[i] = cur->bc_ra[i];
- bp = cur->bc_bufs[i];
+ new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
+ new->bc_levels[i].ra = cur->bc_levels[i].ra;
+ bp = cur->bc_levels[i].bp;
if (bp) {
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
xfs_buf_daddr(bp), mp->m_bsize,
@@ -429,7 +429,7 @@ xfs_btree_dup_cursor(
return error;
}
}
- new->bc_bufs[i] = bp;
+ new->bc_levels[i].bp = bp;
}
*ncur = new;
return 0;
@@ -681,7 +681,7 @@ xfs_btree_get_block(
return xfs_btree_get_iroot(cur);
}
- *bpp = cur->bc_bufs[level];
+ *bpp = cur->bc_levels[level].bp;
return XFS_BUF_TO_BLOCK(*bpp);
}
@@ -691,7 +691,7 @@ xfs_btree_get_block(
*/
STATIC int /* success=1, failure=0 */
xfs_btree_firstrec(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level) /* level to change */
{
struct xfs_btree_block *block; /* generic btree block pointer */
@@ -711,7 +711,7 @@ xfs_btree_firstrec(
/*
* Set the ptr value to 1, that's the first record/key.
*/
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
return 1;
}
@@ -721,7 +721,7 @@ xfs_btree_firstrec(
*/
STATIC int /* success=1, failure=0 */
xfs_btree_lastrec(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level) /* level to change */
{
struct xfs_btree_block *block; /* generic btree block pointer */
@@ -741,7 +741,7 @@ xfs_btree_lastrec(
/*
* Set the ptr value to numrecs, that's the last record/key.
*/
- cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
+ cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
return 1;
}
@@ -922,11 +922,11 @@ xfs_btree_readahead(
(lev == cur->bc_nlevels - 1))
return 0;
- if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+ if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
return 0;
- cur->bc_ra[lev] |= lr;
- block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+ cur->bc_levels[lev].ra |= lr;
+ block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return xfs_btree_readahead_lblock(cur, lr, block);
@@ -985,28 +985,28 @@ xfs_btree_readahead_ptr(
*/
STATIC void
xfs_btree_setbuf(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int lev, /* level in btree */
struct xfs_buf *bp) /* new buffer to set */
{
struct xfs_btree_block *b; /* btree block */
- if (cur->bc_bufs[lev])
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
- cur->bc_bufs[lev] = bp;
- cur->bc_ra[lev] = 0;
+ if (cur->bc_levels[lev].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
+ cur->bc_levels[lev].bp = bp;
+ cur->bc_levels[lev].ra = 0;
b = XFS_BUF_TO_BLOCK(bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
} else {
if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
}
}
@@ -1548,7 +1548,7 @@ xfs_btree_increment(
#endif
/* We're done if we remain in the block after the increment. */
- if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+ if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
goto out1;
/* Fail if we just went off the right edge of the tree. */
@@ -1571,7 +1571,7 @@ xfs_btree_increment(
goto error0;
#endif
- if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+ if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
break;
/* Read-ahead the right block for the next loop. */
@@ -1598,14 +1598,14 @@ xfs_btree_increment(
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
union xfs_btree_ptr *ptrp;
- ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
--lev;
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
- cur->bc_ptrs[lev] = 1;
+ cur->bc_levels[lev].ptr = 1;
}
out1:
*stat = 1;
@@ -1641,7 +1641,7 @@ xfs_btree_decrement(
xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
/* We're done if we remain in the block after the decrement. */
- if (--cur->bc_ptrs[level] > 0)
+ if (--cur->bc_levels[level].ptr > 0)
goto out1;
/* Get a pointer to the btree block. */
@@ -1665,7 +1665,7 @@ xfs_btree_decrement(
* Stop when we don't go off the left edge of a block.
*/
for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
+ if (--cur->bc_levels[lev].ptr > 0)
break;
/* Read-ahead the left block for the next loop. */
xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
@@ -1691,13 +1691,13 @@ xfs_btree_decrement(
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
union xfs_btree_ptr *ptrp;
- ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
--lev;
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
- cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+ cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
}
out1:
*stat = 1;
@@ -1735,7 +1735,7 @@ xfs_btree_lookup_get_block(
*
* Otherwise throw it away and get a new one.
*/
- bp = cur->bc_bufs[level];
+ bp = cur->bc_levels[level].bp;
error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
if (error)
return error;
@@ -1864,7 +1864,7 @@ xfs_btree_lookup(
return -EFSCORRUPTED;
}
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+ cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
*stat = 0;
return 0;
}
@@ -1916,7 +1916,7 @@ xfs_btree_lookup(
if (error)
goto error0;
- cur->bc_ptrs[level] = keyno;
+ cur->bc_levels[level].ptr = keyno;
}
}
@@ -1933,7 +1933,7 @@ xfs_btree_lookup(
!xfs_btree_ptr_is_null(cur, &ptr)) {
int i;
- cur->bc_ptrs[0] = keyno;
+ cur->bc_levels[0].ptr = keyno;
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
@@ -1944,7 +1944,7 @@ xfs_btree_lookup(
}
} else if (dir == XFS_LOOKUP_LE && diff > 0)
keyno--;
- cur->bc_ptrs[0] = keyno;
+ cur->bc_levels[0].ptr = keyno;
/* Return if we succeeded or not. */
if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
@@ -2104,7 +2104,7 @@ __xfs_btree_updkeys(
if (error)
return error;
#endif
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
nlkey = xfs_btree_key_addr(cur, ptr, block);
nhkey = xfs_btree_high_key_addr(cur, ptr, block);
if (!force_all &&
@@ -2171,7 +2171,7 @@ xfs_btree_update_keys(
if (error)
return error;
#endif
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
kp = xfs_btree_key_addr(cur, ptr, block);
xfs_btree_copy_keys(cur, kp, &key, 1);
xfs_btree_log_keys(cur, bp, ptr, ptr);
@@ -2205,7 +2205,7 @@ xfs_btree_update(
goto error0;
#endif
/* Get the address of the rec to be updated. */
- ptr = cur->bc_ptrs[0];
+ ptr = cur->bc_levels[0].ptr;
rp = xfs_btree_rec_addr(cur, ptr, block);
/* Fill in the new contents and log them. */
@@ -2280,7 +2280,7 @@ xfs_btree_lshift(
* If the cursor entry is the one that would be moved, don't
* do it... it's too complicated.
*/
- if (cur->bc_ptrs[level] <= 1)
+ if (cur->bc_levels[level].ptr <= 1)
goto out0;
/* Set up the left neighbor as "left". */
@@ -2414,7 +2414,7 @@ xfs_btree_lshift(
goto error0;
/* Slide the cursor value left one. */
- cur->bc_ptrs[level]--;
+ cur->bc_levels[level].ptr--;
*stat = 1;
return 0;
@@ -2476,7 +2476,7 @@ xfs_btree_rshift(
* do it... it's too complicated.
*/
lrecs = xfs_btree_get_numrecs(left);
- if (cur->bc_ptrs[level] >= lrecs)
+ if (cur->bc_levels[level].ptr >= lrecs)
goto out0;
/* Set up the right neighbor as "right". */
@@ -2664,7 +2664,7 @@ __xfs_btree_split(
*/
lrecs = xfs_btree_get_numrecs(left);
rrecs = lrecs / 2;
- if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+ if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
rrecs++;
src_index = (lrecs - rrecs + 1);
@@ -2760,9 +2760,9 @@ __xfs_btree_split(
* If it's just pointing past the last entry in left, then we'll
* insert there, so don't change anything in that case.
*/
- if (cur->bc_ptrs[level] > lrecs + 1) {
+ if (cur->bc_levels[level].ptr > lrecs + 1) {
xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= lrecs;
+ cur->bc_levels[level].ptr -= lrecs;
}
/*
* If there are more levels, we'll need another cursor which refers
@@ -2772,7 +2772,7 @@ __xfs_btree_split(
error = xfs_btree_dup_cursor(cur, curp);
if (error)
goto error0;
- (*curp)->bc_ptrs[level + 1]++;
+ (*curp)->bc_levels[level + 1].ptr++;
}
*ptrp = rptr;
*stat = 1;
@@ -2785,6 +2785,7 @@ error0:
return error;
}
+#ifdef __KERNEL__
struct xfs_btree_split_args {
struct xfs_btree_cur *cur;
int level;
@@ -2817,7 +2818,7 @@ xfs_btree_split_worker(
* in any way.
*/
if (args->kswapd)
- new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ new_pflags |= PF_MEMALLOC | PF_KSWAPD;
current_set_flags_nested(&pflags, new_pflags);
xfs_trans_set_context(args->cur->bc_tp);
@@ -2870,6 +2871,9 @@ xfs_btree_split(
destroy_work_on_stack(&args.work);
return args.result;
}
+#else
+#define xfs_btree_split __xfs_btree_split
+#endif /* __KERNEL__ */
/*
@@ -2933,7 +2937,8 @@ xfs_btree_new_iroot(
be16_add_cpu(&block->bb_level, 1);
xfs_btree_set_numrecs(block, 1);
cur->bc_nlevels++;
- cur->bc_ptrs[level + 1] = 1;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ cur->bc_levels[level + 1].ptr = 1;
kp = xfs_btree_key_addr(cur, 1, block);
ckp = xfs_btree_key_addr(cur, 1, cblock);
@@ -3094,8 +3099,9 @@ xfs_btree_new_root(
/* Fix up the cursor. */
xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
- cur->bc_ptrs[cur->bc_nlevels] = nptr;
+ cur->bc_levels[cur->bc_nlevels].ptr = nptr;
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
*stat = 1;
return 0;
error0:
@@ -3152,7 +3158,7 @@ xfs_btree_make_block_unfull(
return error;
if (*stat) {
- *oindex = *index = cur->bc_ptrs[level];
+ *oindex = *index = cur->bc_levels[level].ptr;
return 0;
}
@@ -3167,7 +3173,7 @@ xfs_btree_make_block_unfull(
return error;
- *index = cur->bc_ptrs[level];
+ *index = cur->bc_levels[level].ptr;
return 0;
}
@@ -3214,7 +3220,7 @@ xfs_btree_insrec(
}
/* If we're off the left edge, return failure. */
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
if (ptr == 0) {
*stat = 0;
return 0;
@@ -3557,7 +3563,7 @@ xfs_btree_kill_iroot(
if (error)
return error;
- cur->bc_bufs[level - 1] = NULL;
+ cur->bc_levels[level - 1].bp = NULL;
be16_add_cpu(&block->bb_level, -1);
xfs_trans_log_inode(cur->bc_tp, ip,
XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
@@ -3590,8 +3596,8 @@ xfs_btree_kill_root(
if (error)
return error;
- cur->bc_bufs[level] = NULL;
- cur->bc_ra[level] = 0;
+ cur->bc_levels[level].bp = NULL;
+ cur->bc_levels[level].ra = 0;
cur->bc_nlevels--;
return 0;
@@ -3650,7 +3656,7 @@ xfs_btree_delrec(
tcur = NULL;
/* Get the index of the entry being deleted, check for nothing there. */
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
if (ptr == 0) {
*stat = 0;
return 0;
@@ -3960,7 +3966,7 @@ xfs_btree_delrec(
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
tcur = NULL;
if (level == 0)
- cur->bc_ptrs[0]++;
+ cur->bc_levels[0].ptr++;
*stat = 1;
return 0;
@@ -4097,9 +4103,9 @@ xfs_btree_delrec(
* cursor to the left block, and fix up the index.
*/
if (bp != lbp) {
- cur->bc_bufs[level] = lbp;
- cur->bc_ptrs[level] += lrecs;
- cur->bc_ra[level] = 0;
+ cur->bc_levels[level].bp = lbp;
+ cur->bc_levels[level].ptr += lrecs;
+ cur->bc_levels[level].ra = 0;
}
/*
* If we joined with the right neighbor and there's a level above
@@ -4119,16 +4125,16 @@ xfs_btree_delrec(
* We can't use decrement because it would change the next level up.
*/
if (level > 0)
- cur->bc_ptrs[level]--;
+ cur->bc_levels[level].ptr--;
/*
* We combined blocks, so we have to update the parent keys if the
- * btree supports overlapped intervals. However, bc_ptrs[level + 1]
- * points to the old block so that the caller knows which record to
- * delete. Therefore, the caller must be savvy enough to call updkeys
- * for us if we return stat == 2. The other exit points from this
- * function don't require deletions further up the tree, so they can
- * call updkeys directly.
+ * btree supports overlapped intervals. However,
+ * bc_levels[level + 1].ptr points to the old block so that the caller
+ * knows which record to delete. Therefore, the caller must be savvy
+ * enough to call updkeys for us if we return stat == 2. The other
+ * exit points from this function don't require deletions further up
+ * the tree, so they can call updkeys directly.
*/
/* Return value means the next level up has something to do. */
@@ -4182,7 +4188,7 @@ xfs_btree_delete(
if (i == 0) {
for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
+ if (cur->bc_levels[level].ptr == 0) {
error = xfs_btree_decrement(cur, level, &i);
if (error)
goto error0;
@@ -4213,7 +4219,7 @@ xfs_btree_get_rec(
int error; /* error return value */
#endif
- ptr = cur->bc_ptrs[0];
+ ptr = cur->bc_levels[0].ptr;
block = xfs_btree_get_block(cur, 0, &bp);
#ifdef DEBUG
@@ -4512,21 +4518,76 @@ xfs_btree_sblock_verify(
}
/*
- * Calculate the number of btree levels needed to store a given number of
- * records in a short-format btree.
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * height of the tree needed to index the number of leaf records.
*/
-uint
+unsigned int
xfs_btree_compute_maxlevels(
- uint *limits,
- unsigned long len)
+ const unsigned int *limits,
+ unsigned long long records)
{
- uint level;
- unsigned long maxblocks;
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned int height = 1;
- maxblocks = (len + limits[0] - 1) / limits[0];
- for (level = 1; maxblocks > 1; level++)
- maxblocks = (maxblocks + limits[1] - 1) / limits[1];
- return level;
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ height++;
+ }
+
+ return height;
+}
+
+/*
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * number of blocks needed to index the given number of leaf records.
+ */
+unsigned long long
+xfs_btree_calc_size(
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned long long blocks = level_blocks;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ blocks += level_blocks;
+ }
+
+ return blocks;
+}
+
+/*
+ * Given a number of available blocks for the btree to consume with records and
+ * pointers, calculate the height of the tree needed to index all the records
+ * that space can hold based on the number of pointers each interior node
+ * holds.
+ *
+ * We start by assuming a single level tree consumes a single block, then track
+ * the number of blocks each node level consumes until we no longer have space
+ * to store the next node level. At this point, we are indexing all the leaf
+ * blocks in the space, and there's no more free space to split the tree any
+ * further. That's our maximum btree height.
+ */
+unsigned int
+xfs_btree_space_to_height(
+ const unsigned int *limits,
+ unsigned long long leaf_blocks)
+{
+ unsigned long long node_blocks = limits[1];
+ unsigned long long blocks_left = leaf_blocks - 1;
+ unsigned int height = 1;
+
+ if (leaf_blocks < 1)
+ return 0;
+
+ while (node_blocks < blocks_left) {
+ blocks_left -= node_blocks;
+ node_blocks *= limits[1];
+ height++;
+ }
+
+ return height;
}
/*
@@ -4661,23 +4722,25 @@ xfs_btree_overlapped_query_range(
if (error)
goto out;
#endif
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
while (level < cur->bc_nlevels) {
block = xfs_btree_get_block(cur, level, &bp);
/* End of node, pop back towards the root. */
- if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
pop_up:
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
if (level == 0) {
/* Handle a leaf node. */
- recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
@@ -4700,14 +4763,15 @@ pop_up:
/* Record is larger than high key; pop. */
goto pop_up;
}
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
continue;
}
/* Handle an internal node. */
- lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
- hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
- pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+ lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
+ hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
+ block);
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
@@ -4730,13 +4794,13 @@ pop_up:
if (error)
goto out;
#endif
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
continue;
} else if (hdiff < 0) {
/* The low key is larger than the upper range; pop. */
goto pop_up;
}
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
}
out:
@@ -4747,13 +4811,14 @@ out:
* with a zero-results range query, so release the buffers if we
* failed to return any results.
*/
- if (cur->bc_bufs[0] == NULL) {
+ if (cur->bc_levels[0].bp == NULL) {
for (i = 0; i < cur->bc_nlevels; i++) {
- if (cur->bc_bufs[i]) {
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
- cur->bc_bufs[i] = NULL;
- cur->bc_ptrs[i] = 0;
- cur->bc_ra[i] = 0;
+ if (cur->bc_levels[i].bp) {
+ xfs_trans_brelse(cur->bc_tp,
+ cur->bc_levels[i].bp);
+ cur->bc_levels[i].bp = NULL;
+ cur->bc_levels[i].ptr = 0;
+ cur->bc_levels[i].ra = 0;
}
}
}
@@ -4816,29 +4881,6 @@ xfs_btree_query_all(
return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}
-/*
- * Calculate the number of blocks needed to store a given number of records
- * in a short-format (per-AG metadata) btree.
- */
-unsigned long long
-xfs_btree_calc_size(
- uint *limits,
- unsigned long long len)
-{
- int level;
- int maxrecs;
- unsigned long long rval;
-
- maxrecs = limits[0];
- for (level = 0, rval = 0; len > 1; level++) {
- len += maxrecs - 1;
- do_div(len, maxrecs);
- maxrecs = limits[1];
- rval += len;
- }
- return rval;
-}
-
static int
xfs_btree_count_blocks_helper(
struct xfs_btree_cur *cur,
@@ -4915,7 +4957,7 @@ xfs_btree_has_more_records(
block = xfs_btree_get_block(cur, 0, &bp);
/* There are still records in this block. */
- if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
+ if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
return true;
/* There are more record blocks. */
@@ -4924,3 +4966,42 @@ xfs_btree_has_more_records(
else
return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
}
+
+/* Set up all the btree cursor caches. */
+int __init
+xfs_btree_init_cur_caches(void)
+{
+ int error;
+
+ error = xfs_allocbt_init_cur_cache();
+ if (error)
+ return error;
+ error = xfs_inobt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_bmbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_rmapbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_refcountbt_init_cur_cache();
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xfs_btree_destroy_cur_caches();
+ return error;
+}
+
+/* Destroy all the btree cursor caches, if they've been allocated. */
+void
+xfs_btree_destroy_cur_caches(void)
+{
+ xfs_allocbt_destroy_cur_cache();
+ xfs_inobt_destroy_cur_cache();
+ xfs_bmbt_destroy_cur_cache();
+ xfs_rmapbt_destroy_cur_cache();
+ xfs_refcountbt_destroy_cur_cache();
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4eaf8517f850..22d9f411fde6 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -13,8 +13,6 @@ struct xfs_trans;
struct xfs_ifork;
struct xfs_perag;
-extern kmem_zone_t *xfs_btree_cur_zone;
-
/*
* Generic key, ptr and record wrapper structures.
*
@@ -92,8 +90,6 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
-#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */
-
struct xfs_btree_ops {
/* size of the key and record structures */
size_t key_len;
@@ -181,18 +177,18 @@ union xfs_btree_irec {
/* Per-AG btree information. */
struct xfs_btree_cur_ag {
- struct xfs_perag *pag;
+ struct xfs_perag *pag;
union {
struct xfs_buf *agbp;
struct xbtree_afakeroot *afake; /* for staging cursor */
};
union {
struct {
- unsigned long nr_ops; /* # record updates */
- int shape_changes; /* # of extent splits */
+ unsigned int nr_ops; /* # record updates */
+ unsigned int shape_changes; /* # of extent splits */
} refc;
struct {
- bool active; /* allocation cursor state */
+ bool active; /* allocation cursor state */
} abt;
};
};
@@ -212,26 +208,35 @@ struct xfs_btree_cur_ino {
#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
};
+struct xfs_btree_level {
+ /*
+ * Per-level cursor state.  struct xfs_btree_cur carries an array of
+ * these (bc_levels[]) as its trailing flexible array member.
+ *
+ * bp: buffer pointer for this level.
+ */
+ struct xfs_buf *bp;
+
+ /* key/record number (presumably an index into this level's block) */
+ uint16_t ptr;
+
+ /* readahead info: bitmask of the XFS_BTCUR_*RA flags defined below */
+#define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */
+ uint16_t ra;
+};
+
/*
* Btree cursor structure.
* This collects all information needed by the btree code in one place.
*/
-typedef struct xfs_btree_cur
+struct xfs_btree_cur
{
struct xfs_trans *bc_tp; /* transaction we're in, if any */
struct xfs_mount *bc_mp; /* file system mount struct */
const struct xfs_btree_ops *bc_ops;
- uint bc_flags; /* btree features - below */
+ struct kmem_cache *bc_cache; /* cursor cache */
+ unsigned int bc_flags; /* btree features - below */
+ xfs_btnum_t bc_btnum; /* identifies which btree type */
union xfs_btree_irec bc_rec; /* current insert/search record value */
- struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
- int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
- uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
-#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
-#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
- uint8_t bc_nlevels; /* number of levels in the tree */
- uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
- xfs_btnum_t bc_btnum; /* identifies which btree type */
- int bc_statoff; /* offset of btre stats array */
+ uint8_t bc_nlevels; /* number of levels in the tree */
+ uint8_t bc_maxlevels; /* maximum levels for this btree type */
+ int bc_statoff; /* offset of btree stats array */
/*
* Short btree pointers need an agno to be able to turn the pointers
@@ -243,7 +248,21 @@ typedef struct xfs_btree_cur
struct xfs_btree_cur_ag bc_ag;
struct xfs_btree_cur_ino bc_ino;
};
-} xfs_btree_cur_t;
+
+ /* Must be at the end of the struct! */
+ struct xfs_btree_level bc_levels[];
+};
+
+/*
+ * Compute the size of a btree cursor that can handle a btree of a given
+ * height. The bc_levels array handles node and leaf blocks, so its size
+ * is exactly nlevels.
+ *
+ * @nlevels is the number of struct xfs_btree_level entries to account for.
+ * The result is whatever struct_size() reports for a struct xfs_btree_cur
+ * followed by @nlevels bc_levels[] elements.  The NULL pointer cast only
+ * supplies the structure type to struct_size(); NOTE(review): this relies on
+ * struct_size() never dereferencing its pointer argument - confirm against
+ * include/linux/overflow.h.
+ */
+static inline size_t
+xfs_btree_cur_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels);
+}
/* cursor flags */
#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
@@ -258,7 +277,6 @@ typedef struct xfs_btree_cur
*/
#define XFS_BTREE_STAGING (1<<5)
-
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
@@ -309,7 +327,7 @@ xfs_btree_check_sptr(
*/
void
xfs_btree_del_cursor(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int error); /* del because of error */
/*
@@ -318,8 +336,8 @@ xfs_btree_del_cursor(
*/
int /* error */
xfs_btree_dup_cursor(
- xfs_btree_cur_t *cur, /* input cursor */
- xfs_btree_cur_t **ncur);/* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur);/* output cursor */
/*
* Compute first and last byte offsets for the fields given.
@@ -460,8 +478,12 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
-uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
-unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
+unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
+ unsigned long long records);
+unsigned long long xfs_btree_calc_size(const unsigned int *limits,
+ unsigned long long records);
+unsigned int xfs_btree_space_to_height(const unsigned int *limits,
+ unsigned long long blocks);
/*
* Return codes for the query range iterator function are 0 to continue
@@ -527,7 +549,7 @@ struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
/* Does this cursor point to the last block in the given level? */
static inline bool
xfs_btree_islastblock(
- xfs_btree_cur_t *cur,
+ struct xfs_btree_cur *cur,
int level)
{
struct xfs_btree_block *block;
@@ -558,4 +580,27 @@ void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
union xfs_btree_key *dst_key,
const union xfs_btree_key *src_key, int numkeys);
+static inline struct xfs_btree_cur *
+xfs_btree_alloc_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_btnum_t btnum,
+ uint8_t maxlevels,
+ struct kmem_cache *cache)
+{
+ struct xfs_btree_cur *cur;
+ /*
+ * Common btree cursor allocation helper.
+ *
+ * Takes a zeroed cursor from @cache and records the mount (@mp), the
+ * transaction (@tp), the btree type (@btnum), the height limit
+ * (@maxlevels, stored in bc_maxlevels) and the cache itself (stored in
+ * bc_cache, presumably so the cursor can later be freed back to the
+ * cache it came from).  Every other field is left zeroed for the
+ * caller to fill in.
+ *
+ * The allocation is requested with __GFP_NOFAIL and the result is
+ * dereferenced without a NULL check.
+ */
+ cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = btnum;
+ cur->bc_maxlevels = maxlevels;
+ cur->bc_cache = cache;
+
+ return cur;
+}
+
+int __init xfs_btree_init_cur_caches(void);
+void xfs_btree_destroy_cur_caches(void);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index ac9e80152b5c..dd75e208b543 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -657,12 +657,12 @@ xfs_btree_bload_compute_geometry(
* checking levels 0 and 1 here, so set bc_nlevels such that the btree
* code doesn't interpret either as the root level.
*/
- cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1;
+ cur->bc_nlevels = cur->bc_maxlevels - 1;
xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
bbl->nr_records = nr_this_level = nr_records;
- for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) {
+ for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
uint64_t level_blocks;
uint64_t dontcare64;
unsigned int level = cur->bc_nlevels - 1;
@@ -703,6 +703,7 @@ xfs_btree_bload_compute_geometry(
* block-based btree level.
*/
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
xfs_btree_bload_level_geometry(cur, bbl, level,
nr_this_level, &avg_per_block,
&level_blocks, &dontcare64);
@@ -718,13 +719,14 @@ xfs_btree_bload_compute_geometry(
/* Otherwise, we need another level of btree. */
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
}
nr_blocks += level_blocks;
nr_this_level = level_blocks;
}
- if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS)
+ if (cur->bc_nlevels > cur->bc_maxlevels)
return -EOVERFLOW;
bbl->btree_height = cur->bc_nlevels;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index c062e2c85178..9dc1ecb9713d 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -72,7 +72,7 @@ STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
xfs_da_state_blk_t *save_blk);
-kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
+struct kmem_cache *xfs_da_state_cache; /* anchor for dir/attr state */
/*
* Allocate a dir-state structure.
@@ -84,7 +84,7 @@ xfs_da_state_alloc(
{
struct xfs_da_state *state;
- state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL);
+ state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
state->args = args;
state->mp = args->dp->i_mount;
return state;
@@ -113,7 +113,7 @@ xfs_da_state_free(xfs_da_state_t *state)
#ifdef DEBUG
memset((char *)state, 0, sizeof(*state));
#endif /* DEBUG */
- kmem_cache_free(xfs_da_state_zone, state);
+ kmem_cache_free(xfs_da_state_cache, state);
}
static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
@@ -864,7 +864,6 @@ xfs_da3_node_rebalance(
{
struct xfs_da_intnode *node1;
struct xfs_da_intnode *node2;
- struct xfs_da_intnode *tmpnode;
struct xfs_da_node_entry *btree1;
struct xfs_da_node_entry *btree2;
struct xfs_da_node_entry *btree_s;
@@ -894,9 +893,7 @@ xfs_da3_node_rebalance(
((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
(be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
- tmpnode = node1;
- node1 = node2;
- node2 = tmpnode;
+ swap(node1, node2);
xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
btree1 = nodehdr1.btree;
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ad5dd324631a..0faf7d9ac241 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -9,7 +9,6 @@
struct xfs_inode;
struct xfs_trans;
-struct zone;
/*
* Directory/attribute geometry information. There will be one of these for each
@@ -227,6 +226,6 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
-extern struct kmem_zone *xfs_da_state_zone;
+extern struct kmem_cache *xfs_da_state_cache;
#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eff4a127188e..0805ade2d300 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -18,6 +18,12 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+
+static struct kmem_cache *xfs_defer_pending_cache;
/*
* Deferred Operations in XFS
@@ -232,23 +238,20 @@ xfs_defer_trans_abort(
}
}
-/* Roll a transaction so we can do some deferred op processing. */
-STATIC int
-xfs_defer_trans_roll(
- struct xfs_trans **tpp)
+/*
+ * Capture resources that the caller said not to release ("held") when the
+ * transaction commits. Caller is responsible for zero-initializing @dres.
+ */
+static int
+xfs_defer_save_resources(
+ struct xfs_defer_resources *dres,
+ struct xfs_trans *tp)
{
- struct xfs_trans *tp = *tpp;
struct xfs_buf_log_item *bli;
struct xfs_inode_log_item *ili;
struct xfs_log_item *lip;
- struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS];
- struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES];
- unsigned int ordered = 0; /* bitmap */
- int bpcount = 0, ipcount = 0;
- int i;
- int error;
- BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS);
+ BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);
list_for_each_entry(lip, &tp->t_items, li_trans) {
switch (lip->li_type) {
@@ -256,28 +259,29 @@ xfs_defer_trans_roll(
bli = container_of(lip, struct xfs_buf_log_item,
bli_item);
if (bli->bli_flags & XFS_BLI_HOLD) {
- if (bpcount >= XFS_DEFER_OPS_NR_BUFS) {
+ if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
ASSERT(0);
return -EFSCORRUPTED;
}
if (bli->bli_flags & XFS_BLI_ORDERED)
- ordered |= (1U << bpcount);
+ dres->dr_ordered |=
+ (1U << dres->dr_bufs);
else
xfs_trans_dirty_buf(tp, bli->bli_buf);
- bplist[bpcount++] = bli->bli_buf;
+ dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
}
break;
case XFS_LI_INODE:
ili = container_of(lip, struct xfs_inode_log_item,
ili_item);
if (ili->ili_lock_flags == 0) {
- if (ipcount >= XFS_DEFER_OPS_NR_INODES) {
+ if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
ASSERT(0);
return -EFSCORRUPTED;
}
xfs_trans_log_inode(tp, ili->ili_inode,
XFS_ILOG_CORE);
- iplist[ipcount++] = ili->ili_inode;
+ dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
}
break;
default:
@@ -285,7 +289,43 @@ xfs_defer_trans_roll(
}
}
- trace_xfs_defer_trans_roll(tp, _RET_IP_);
+ return 0;
+}
+
+/*
+ * Attach the held resources to the transaction.
+ *
+ * Joins the first dr_inos inodes and the first dr_bufs buffers recorded in
+ * @dres to @tp.  @dres itself is only read, never modified, so the caller
+ * keeps ownership of the recorded pointers.
+ */
+static void
+xfs_defer_restore_resources(
+ struct xfs_trans *tp,
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ /* Rejoin the joined inodes (lock flags argument is 0). */
+ for (i = 0; i < dres->dr_inos; i++)
+ xfs_trans_ijoin(tp, dres->dr_ip[i], 0);
+
+ /*
+ * Rejoin the buffers.  A buffer whose bit is set in dr_ordered is
+ * passed to xfs_trans_ordered_buf() again, and every buffer is
+ * re-held with xfs_trans_bhold().
+ */
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_trans_bjoin(tp, dres->dr_bp[i]);
+ if (dres->dr_ordered & (1U << i))
+ xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
+ xfs_trans_bhold(tp, dres->dr_bp[i]);
+ }
+}
+
+/* Roll a transaction so we can do some deferred op processing. */
+STATIC int
+xfs_defer_trans_roll(
+ struct xfs_trans **tpp)
+{
+ struct xfs_defer_resources dres = { };
+ int error;
+
+ error = xfs_defer_save_resources(&dres, *tpp);
+ if (error)
+ return error;
+
+ trace_xfs_defer_trans_roll(*tpp, _RET_IP_);
/*
* Roll the transaction. Rolling always gives us a new transaction (even
@@ -295,22 +335,11 @@ xfs_defer_trans_roll(
* happened.
*/
error = xfs_trans_roll(tpp);
- tp = *tpp;
- /* Rejoin the joined inodes. */
- for (i = 0; i < ipcount; i++)
- xfs_trans_ijoin(tp, iplist[i], 0);
-
- /* Rejoin the buffers and dirty them so the log moves forward. */
- for (i = 0; i < bpcount; i++) {
- xfs_trans_bjoin(tp, bplist[i]);
- if (ordered & (1U << i))
- xfs_trans_ordered_buf(tp, bplist[i]);
- xfs_trans_bhold(tp, bplist[i]);
- }
+ xfs_defer_restore_resources(*tpp, &dres);
if (error)
- trace_xfs_defer_trans_roll_error(tp, error);
+ trace_xfs_defer_trans_roll_error(*tpp, error);
return error;
}
@@ -342,7 +371,7 @@ xfs_defer_cancel_list(
ops->cancel_item(pwi);
}
ASSERT(dfp->dfp_count == 0);
- kmem_free(dfp);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
}
}
@@ -439,7 +468,7 @@ xfs_defer_finish_one(
/* Done with the dfp, free it. */
list_del(&dfp->dfp_list);
- kmem_free(dfp);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
out:
if (ops->finish_cleanup)
ops->finish_cleanup(tp, state, error);
@@ -573,8 +602,8 @@ xfs_defer_add(
dfp = NULL;
}
if (!dfp) {
- dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
- KM_NOFS);
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
dfp->dfp_type = type;
dfp->dfp_intent = NULL;
dfp->dfp_done = NULL;
@@ -627,10 +656,11 @@ xfs_defer_move(
*/
static struct xfs_defer_capture *
xfs_defer_ops_capture(
- struct xfs_trans *tp,
- struct xfs_inode *capture_ip)
+ struct xfs_trans *tp)
{
struct xfs_defer_capture *dfc;
+ unsigned short i;
+ int error;
if (list_empty(&tp->t_dfops))
return NULL;
@@ -654,27 +684,48 @@ xfs_defer_ops_capture(
/* Preserve the log reservation size. */
dfc->dfc_logres = tp->t_log_res;
+ error = xfs_defer_save_resources(&dfc->dfc_held, tp);
+ if (error) {
+ /*
+ * Resource capture should never fail, but if it does, we
+ * still have to shut down the log and release things
+ * properly.
+ */
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
/*
- * Grab an extra reference to this inode and attach it to the capture
- * structure.
+ * Grab extra references to the inodes and buffers because callers are
+ * expected to release their held references after we commit the
+ * transaction.
*/
- if (capture_ip) {
- ihold(VFS_I(capture_ip));
- dfc->dfc_capture_ip = capture_ip;
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
+ ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+ ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
}
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_hold(dfc->dfc_held.dr_bp[i]);
+
return dfc;
}
/* Release all resources that we used to capture deferred ops. */
void
-xfs_defer_ops_release(
+xfs_defer_ops_capture_free(
struct xfs_mount *mp,
struct xfs_defer_capture *dfc)
{
+ unsigned short i;
+
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
- if (dfc->dfc_capture_ip)
- xfs_irele(dfc->dfc_capture_ip);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_relse(dfc->dfc_held.dr_bp[i]);
+
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++)
+ xfs_irele(dfc->dfc_held.dr_ip[i]);
+
kmem_free(dfc);
}
@@ -689,24 +740,21 @@ xfs_defer_ops_release(
int
xfs_defer_ops_capture_and_commit(
struct xfs_trans *tp,
- struct xfs_inode *capture_ip,
struct list_head *capture_list)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_defer_capture *dfc;
int error;
- ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL));
-
/* If we don't capture anything, commit transaction and exit. */
- dfc = xfs_defer_ops_capture(tp, capture_ip);
+ dfc = xfs_defer_ops_capture(tp);
if (!dfc)
return xfs_trans_commit(tp);
/* Commit the transaction and add the capture structure to the list. */
error = xfs_trans_commit(tp);
if (error) {
- xfs_defer_ops_release(mp, dfc);
+ xfs_defer_ops_capture_free(mp, dfc);
return error;
}
@@ -724,17 +772,19 @@ void
xfs_defer_ops_continue(
struct xfs_defer_capture *dfc,
struct xfs_trans *tp,
- struct xfs_inode **captured_ipp)
+ struct xfs_defer_resources *dres)
{
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
/* Lock the captured inodes, then rejoin all captured resources to the new transaction. */
- if (dfc->dfc_capture_ip) {
- xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0);
- }
- *captured_ipp = dfc->dfc_capture_ip;
+ if (dfc->dfc_held.dr_inos == 2)
+ xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
+ dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
+ else if (dfc->dfc_held.dr_inos == 1)
+ xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+ xfs_defer_restore_resources(tp, &dfc->dfc_held);
+ memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
/* Move captured dfops chain and state to the transaction. */
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
@@ -742,3 +792,82 @@ xfs_defer_ops_continue(
kmem_free(dfc);
}
+
+/*
+ * Release the resources captured and continued during recovery.
+ *
+ * For each of the dr_inos inodes in @dres: drop XFS_ILOCK_EXCL, release the
+ * inode with xfs_irele() and clear the slot.  For each of the dr_bufs
+ * buffers: xfs_buf_relse() it and clear the slot.  The counters and the
+ * ordered bitmap are then reset to zero, which makes a second call on the
+ * same @dres a no-op.
+ *
+ * Assumes every recorded inode is currently locked XFS_ILOCK_EXCL, which is
+ * the state xfs_defer_ops_continue() leaves them in.
+ */
+void
+xfs_defer_resources_rele(
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ for (i = 0; i < dres->dr_inos; i++) {
+ xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
+ xfs_irele(dres->dr_ip[i]);
+ dres->dr_ip[i] = NULL;
+ }
+
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_buf_relse(dres->dr_bp[i]);
+ dres->dr_bp[i] = NULL;
+ }
+
+ dres->dr_inos = 0;
+ dres->dr_bufs = 0;
+ dres->dr_ordered = 0;
+}
+/*
+ * Create the "xfs_defer_pending" cache, sized for one struct
+ * xfs_defer_pending per object, and store it in xfs_defer_pending_cache.
+ * The trailing 0, 0, NULL arguments are presumably alignment, flags and
+ * constructor - TODO confirm against the slab API.
+ *
+ * Returns 0 on success or -ENOMEM if kmem_cache_create() returned NULL.
+ */
+static inline int __init
+xfs_defer_init_cache(void)
+{
+ xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending",
+ sizeof(struct xfs_defer_pending),
+ 0, 0, NULL);
+
+ return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM;
+}
+/*
+ * Destroy the cache held in xfs_defer_pending_cache and reset the pointer to
+ * NULL so that no stale cache pointer is left behind.
+ */
+static inline void
+xfs_defer_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_defer_pending_cache);
+ xfs_defer_pending_cache = NULL;
+}
+
+/*
+ * Set up caches for deferred work items.
+ *
+ * Creates the xfs_defer_pending cache first, then the rmap, refcount, bmap
+ * and extfree intent caches, stopping at the first failure.  If the first
+ * step fails, its error is returned as is.  If a later step fails,
+ * xfs_defer_destroy_item_caches() is called before the error is returned;
+ * that function runs every destroy helper, so those helpers are presumably
+ * safe to call for a cache that was never created.
+ *
+ * Returns 0 on success or the failing initializer's error code.
+ */
+int __init
+xfs_defer_init_item_caches(void)
+{
+ int error;
+
+ error = xfs_defer_init_cache();
+ if (error)
+ return error; /* first initializer: nothing earlier to undo */
+ error = xfs_rmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_refcount_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_bmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_extfree_intent_init_cache();
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xfs_defer_destroy_item_caches();
+ return error;
+}
+
+/*
+ * Destroy all the deferred work item caches, if they've been allocated.
+ *
+ * The destroy helpers run in the reverse of the order in which
+ * xfs_defer_init_item_caches() sets the caches up: extfree, bmap, refcount
+ * and rmap intents first, the xfs_defer_pending cache last.  This function
+ * also serves as the error unwind path of xfs_defer_init_item_caches().
+ */
+void
+xfs_defer_destroy_item_caches(void)
+{
+ xfs_extfree_intent_destroy_cache();
+ xfs_bmap_intent_destroy_cache();
+ xfs_refcount_intent_destroy_cache();
+ xfs_rmap_intent_destroy_cache();
+ xfs_defer_destroy_cache();
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 05472f71fffe..7bb8a31ad65b 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -65,6 +65,30 @@ extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
/*
+ * Deferred operation item relogging limits.
+ */
+#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
+
+/*
+ * Resources that must be held across a transaction roll.
+ *
+ * Holds at most XFS_DEFER_OPS_NR_BUFS buffers and XFS_DEFER_OPS_NR_INODES
+ * inodes.  Only the first dr_bufs entries of dr_bp[] and the first dr_inos
+ * entries of dr_ip[] are meaningful.
+ */
+struct xfs_defer_resources {
+ /* held buffers */
+ struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS];
+
+ /* inodes with no unlock flags */
+ struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES];
+
+ /* number of held buffers, i.e. valid entries in dr_bp[] */
+ unsigned short dr_bufs;
+
+ /* bitmap of ordered buffers: bit i set means dr_bp[i] is ordered */
+ unsigned short dr_ordered;
+
+ /* number of held inodes, i.e. valid entries in dr_ip[] */
+ unsigned short dr_inos;
+};
+
+/*
* This structure enables a dfops user to detach the chain of deferred
* operations from a transaction so that they can be continued later.
*/
@@ -83,11 +107,7 @@ struct xfs_defer_capture {
/* Log reservation saved from the transaction. */
unsigned int dfc_logres;
- /*
- * An inode reference that must be maintained to complete the deferred
- * work.
- */
- struct xfs_inode *dfc_capture_ip;
+ struct xfs_defer_resources dfc_held;
};
/*
@@ -95,9 +115,14 @@ struct xfs_defer_capture {
* This doesn't normally happen except log recovery.
*/
int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
- struct xfs_inode *capture_ip, struct list_head *capture_list);
+ struct list_head *capture_list);
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
- struct xfs_inode **captured_ipp);
-void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
+ struct xfs_defer_resources *dres);
+void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+ struct xfs_defer_capture *d);
+void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
+
+int __init xfs_defer_init_item_caches(void);
+void xfs_defer_destroy_item_caches(void);
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 50546eadaae2..5f1e4799e8fa 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -19,7 +19,11 @@
#include "xfs_error.h"
#include "xfs_trace.h"
-struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+const struct xfs_name xfs_name_dotdot = { /* read-only name for the ".." entry */
+ .name = (const unsigned char *)"..",
+ .len = 2, /* two characters, terminating NUL not counted */
+ .type = XFS_DIR3_FT_DIR,
+};
/*
* Convert inode mode to directory entry filetype
@@ -54,10 +58,10 @@ xfs_mode_to_ftype(
*/
xfs_dahash_t
xfs_ascii_ci_hashname(
- struct xfs_name *name)
+ const struct xfs_name *name)
{
- xfs_dahash_t hash;
- int i;
+ xfs_dahash_t hash;
+ int i;
for (i = 0, hash = 0; i < name->len; i++)
hash = tolower(name->name[i]) ^ rol32(hash, 7);
@@ -243,7 +247,7 @@ int
xfs_dir_createname(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name,
+ const struct xfs_name *name,
xfs_ino_t inum, /* new entry inode number */
xfs_extlen_t total) /* bmap's total block count */
{
@@ -337,16 +341,16 @@ xfs_dir_cilookup_result(
int
xfs_dir_lookup(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_ino_t *inum, /* out: inode number */
- struct xfs_name *ci_name) /* out: actual name if CI match */
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t *inum, /* out: inode number */
+ struct xfs_name *ci_name) /* out: actual name if CI match */
{
- struct xfs_da_args *args;
- int rval;
- int v; /* type-checking value */
- int lock_mode;
+ struct xfs_da_args *args;
+ int rval;
+ int v; /* type-checking value */
+ int lock_mode;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
@@ -475,7 +479,7 @@ int
xfs_dir_replace(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name, /* name of entry to replace */
+ const struct xfs_name *name, /* name of entry to replace */
xfs_ino_t inum, /* new inode number */
xfs_extlen_t total) /* bmap's total block count */
{
@@ -728,7 +732,7 @@ xfs_dir2_namecheck(
xfs_dahash_t
xfs_dir2_hashname(
struct xfs_mount *mp,
- struct xfs_name *name)
+ const struct xfs_name *name)
{
if (unlikely(xfs_has_asciici(mp)))
return xfs_ascii_ci_hashname(name);
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index d03e6098ded9..b6df3c34b26a 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -21,7 +21,7 @@ struct xfs_dir2_data_unused;
struct xfs_dir3_icfree_hdr;
struct xfs_dir3_icleaf_hdr;
-extern struct xfs_name xfs_name_dotdot;
+extern const struct xfs_name xfs_name_dotdot;
/*
* Convert inode mode to directory entry filetype
@@ -39,16 +39,16 @@ extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t inum,
+ const struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t *inum,
+ const struct xfs_name *name, xfs_ino_t *inum,
struct xfs_name *ci_name);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t inum,
+ const struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name);
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 711709a2aa53..7404a9ff1a92 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -40,7 +40,7 @@ struct xfs_dir3_icfree_hdr {
};
/* xfs_dir2.c */
-xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name);
+xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name);
enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
@@ -201,7 +201,8 @@ xfs_dir2_data_entsize(
return round_up(len, XFS_DIR2_DATA_ALIGN);
}
-xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name);
+xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp,
+ const struct xfs_name *name);
enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index deeb74becabc..15a362e2f5ea 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -22,7 +22,7 @@ xfs_calc_dquots_per_chunk(
unsigned int nbblks) /* basic block units */
{
ASSERT(nbblks > 0);
- return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
+ return BBTOB(nbblks) / sizeof(struct xfs_dqblk);
}
/*
@@ -127,7 +127,7 @@ xfs_dqblk_repair(
* Typically, a repair is only requested by quotacheck.
*/
ASSERT(id != -1);
- memset(dqb, 0, sizeof(xfs_dqblk_t));
+ memset(dqb, 0, sizeof(struct xfs_dqblk));
dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 2d7057b7984b..d665c04e69dd 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -184,7 +184,7 @@ typedef struct xfs_sb {
* Superblock - on disk version. Must match the in core version above.
* Must be padded to 64 bit alignment.
*/
-typedef struct xfs_dsb {
+struct xfs_dsb {
__be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
__be32 sb_blocksize; /* logical block size, bytes */
__be64 sb_dblocks; /* number of data blocks */
@@ -263,7 +263,7 @@ typedef struct xfs_dsb {
uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
-} xfs_dsb_t;
+};
/*
* Misc. Flags - warning - these will be cleared by xfs_repair unless
@@ -780,7 +780,7 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
* padding field for v3 inodes.
*/
#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-typedef struct xfs_dinode {
+struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
@@ -825,7 +825,7 @@ typedef struct xfs_dinode {
uuid_t di_uuid; /* UUID of the filesystem */
/* structure must be padded to 64 bit alignment */
-} xfs_dinode_t;
+};
#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
@@ -1215,7 +1215,7 @@ struct xfs_disk_dquot {
* This is what goes on disk. This is separated from the xfs_disk_dquot because
* carrying the unnecessary padding would be a waste of memory.
*/
-typedef struct xfs_dqblk {
+struct xfs_dqblk {
struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */
char dd_fill[4];/* filling for posterity */
@@ -1225,7 +1225,7 @@ typedef struct xfs_dqblk {
__be32 dd_crc; /* checksum */
__be64 dd_lsn; /* last modification in log */
uuid_t dd_uuid; /* location information */
-} xfs_dqblk_t;
+};
#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index bde2b4c64dbe..505533c43a92 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -93,21 +93,6 @@ struct getbmapx {
#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */
/*
- * Structure for XFS_IOC_FSSETDM.
- * For use by backup and restore programs to set the XFS on-disk inode
- * fields di_dmevmask and di_dmstate. These must be set to exactly and
- * only values previously obtained via xfs_bulkstat! (Specifically the
- * struct xfs_bstat fields bs_dmevmask and bs_dmstate.)
- */
-#ifndef HAVE_FSDMIDATA
-struct fsdmidata {
- __u32 fsd_dmevmask; /* corresponds to di_dmevmask */
- __u16 fsd_padding;
- __u16 fsd_dmstate; /* corresponds to di_dmstate */
-};
-#endif
-
-/*
* File segment locking set data type for 64 bit access.
* Also used for all the RESV/FREE interfaces.
*/
@@ -268,6 +253,8 @@ typedef struct xfs_fsop_resblks {
*/
#define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */
#define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */
+#define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE)
+#define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE)
/* keep the maximum size under 2^31 by a small amount */
#define XFS_MAX_LOG_BYTES \
@@ -560,16 +547,10 @@ typedef struct xfs_fsop_handlereq {
/*
* Compound structures for passing args through Handle Request interfaces
- * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle
- * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and
- * XFS_IOC_ATTRMULTI_BY_HANDLE
+ * xfs_attrlist_by_handle, xfs_attrmulti_by_handle
+ * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE
*/
-typedef struct xfs_fsop_setdm_handlereq {
- struct xfs_fsop_handlereq hreq; /* handle information */
- struct fsdmidata __user *data; /* DMAPI data */
-} xfs_fsop_setdm_handlereq_t;
-
/*
* Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface.
*
@@ -779,15 +760,15 @@ struct xfs_scrub_metadata {
* For 'documentation' purposes more than anything else,
* the "cmd #" field reflects the IRIX fcntl number.
*/
-#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
-#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
+/* XFS_IOC_ALLOCSP ------- deprecated 10 */
+/* XFS_IOC_FREESP -------- deprecated 11 */
#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
-#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
-#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
+/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */
+/* XFS_IOC_FREESP64 ------ deprecated 37 */
#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
-#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata)
+/* XFS_IOC_FSSETDM ------- deprecated 39 */
#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64)
#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64)
#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64)
@@ -829,7 +810,7 @@ struct xfs_scrub_metadata {
#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */
#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */
-#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
+/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
#define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 994ad783d407..b418fe0c0679 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1827,7 +1827,7 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
M_IGEO(mp)->ialloc_blks,
&XFS_RMAP_OINFO_INODES);
return;
@@ -1872,7 +1872,7 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
contigblk, &XFS_RMAP_OINFO_INODES);
/* reset range to current bit and carry on... */
@@ -2793,6 +2793,7 @@ xfs_ialloc_setup_geometry(
inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
inodes);
+ ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());
/*
* Set the maximum inode count for this filesystem, being careful not
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 27190840c5d8..b2ad2fdc40f5 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -22,6 +22,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
+static struct kmem_cache *xfs_inobt_cur_cache;
+
STATIC int
xfs_inobt_get_minrecs(
struct xfs_btree_cur *cur,
@@ -432,10 +434,8 @@ xfs_inobt_init_common(
{
struct xfs_btree_cur *cur;
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = btnum;
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+ M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
if (btnum == XFS_BTNUM_INO) {
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
cur->bc_ops = &xfs_inobt_ops;
@@ -444,8 +444,6 @@ xfs_inobt_init_common(
cur->bc_ops = &xfs_finobt_ops;
}
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
@@ -530,6 +528,17 @@ xfs_inobt_commit_staged_btree(
}
}
+/* Calculate number of records in an inode btree block. */
+static inline unsigned int
+xfs_inobt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_inobt_rec_t);
+ return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
+
/*
* Calculate number of records in an inobt btree block.
*/
@@ -540,10 +549,54 @@ xfs_inobt_maxrecs(
int leaf)
{
blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+ return xfs_inobt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_inobt_rec_t);
- return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+/*
+ * Maximum number of inode btree records per AG. Pretend that we can fill an
+ * entire AG completely full of inodes except for the AG headers.
+ */
+#define XFS_MAX_INODE_RECORDS \
+ ((XFS_MAX_AG_BYTES - (4 * BBSIZE)) / XFS_DINODE_MIN_SIZE) / \
+ XFS_INODES_PER_CHUNK
+
+/* Compute the max possible height for the inode btree. */
+static inline unsigned int
+xfs_inobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for the free inode btree. */
+static inline unsigned int
+xfs_finobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for either inode btree. */
+unsigned int
+xfs_iallocbt_maxlevels_ondisk(void)
+{
+ return max(xfs_inobt_maxlevels_ondisk(),
+ xfs_finobt_maxlevels_ondisk());
}
/*
@@ -761,3 +814,22 @@ xfs_iallocbt_calc_size(
{
return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
}
+
+int __init
+xfs_inobt_init_cur_cache(void)
+{
+ xfs_inobt_cur_cache = kmem_cache_create("xfs_inobt_cur",
+ xfs_btree_cur_sizeof(xfs_inobt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_inobt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_inobt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_inobt_cur_cache);
+ xfs_inobt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 8a322d402e61..26451cb76b98 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -75,4 +75,9 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
+unsigned int xfs_iallocbt_maxlevels_ondisk(void);
+
+int __init xfs_inobt_init_cur_cache(void);
+void xfs_inobt_destroy_cur_cache(void);
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 3932b4ebf903..cae9708c8587 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -51,9 +51,9 @@ xfs_inode_buf_verify(
agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) {
- int di_ok;
- xfs_dinode_t *dip;
- xfs_agino_t unlinked_ino;
+ struct xfs_dinode *dip;
+ xfs_agino_t unlinked_ino;
+ int di_ok;
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 1d174909f9bd..9149f4f796fc 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -26,7 +26,7 @@
#include "xfs_types.h"
#include "xfs_errortag.h"
-kmem_zone_t *xfs_ifork_zone;
+struct kmem_cache *xfs_ifork_cache;
void
xfs_init_local_fork(
@@ -67,10 +67,10 @@ xfs_init_local_fork(
*/
STATIC int
xfs_iformat_local(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork,
- int size)
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork,
+ int size)
{
/*
* If the size is unreasonable, then something
@@ -162,8 +162,8 @@ xfs_iformat_extents(
*/
STATIC int
xfs_iformat_btree(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
int whichfork)
{
struct xfs_mount *mp = ip->i_mount;
@@ -284,7 +284,7 @@ xfs_ifork_alloc(
{
struct xfs_ifork *ifp;
- ifp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
+ ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL);
ifp->if_format = format;
ifp->if_nextents = nextents;
return ifp;
@@ -325,7 +325,7 @@ xfs_iformat_attr_fork(
}
if (error) {
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_afp);
ip->i_afp = NULL;
}
return error;
@@ -580,8 +580,8 @@ xfs_iextents_copy(
*/
void
xfs_iflush_fork(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
struct xfs_inode_log_item *iip,
int whichfork)
{
@@ -676,7 +676,7 @@ xfs_ifork_init_cow(
if (ip->i_cowfp)
return;
- ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone,
+ ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
GFP_NOFS | __GFP_NOFAIL);
ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index a6f7897b6887..3d64a3acb0ed 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -221,7 +221,7 @@ static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp,
xfs_iext_get_extent((ifp), (ext), (got)); \
xfs_iext_next((ifp), (ext)))
-extern struct kmem_zone *xfs_ifork_zone;
+extern struct kmem_cache *xfs_ifork_cache;
extern void xfs_ifork_init_cow(struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index e5d767a7fc5d..327ba25e9e17 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -24,6 +24,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
+struct kmem_cache *xfs_refcount_intent_cache;
+
/* Allowable refcount adjustment amounts. */
enum xfs_refc_adjust_op {
XFS_REFCOUNT_ADJUST_INCREASE = 1,
@@ -916,8 +918,7 @@ xfs_refcount_adjust_extents(
struct xfs_btree_cur *cur,
xfs_agblock_t *agbno,
xfs_extlen_t *aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_owner_info *oinfo)
+ enum xfs_refc_adjust_op adj)
{
struct xfs_refcount_irec ext, tmp;
int error;
@@ -974,8 +975,8 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
tmp.rc_startblock);
- xfs_bmap_add_free(cur->bc_tp, fsbno,
- tmp.rc_blockcount, oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ tmp.rc_blockcount, NULL);
}
(*agbno) += tmp.rc_blockcount;
@@ -1019,8 +1020,8 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
ext.rc_startblock);
- xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount,
- oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ ext.rc_blockcount, NULL);
}
skip:
@@ -1048,8 +1049,7 @@ xfs_refcount_adjust(
xfs_extlen_t aglen,
xfs_agblock_t *new_agbno,
xfs_extlen_t *new_aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_owner_info *oinfo)
+ enum xfs_refc_adjust_op adj)
{
bool shape_changed;
int shape_changes = 0;
@@ -1092,8 +1092,7 @@ xfs_refcount_adjust(
cur->bc_ag.refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
- adj, oinfo);
+ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj);
if (error)
goto out_error;
@@ -1188,12 +1187,12 @@ xfs_refcount_finish_one(
switch (type) {
case XFS_REFCOUNT_INCREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_INCREASE, NULL);
+ new_len, XFS_REFCOUNT_ADJUST_INCREASE);
*new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
break;
case XFS_REFCOUNT_DECREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_DECREASE, NULL);
+ new_len, XFS_REFCOUNT_ADJUST_DECREASE);
*new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
break;
case XFS_REFCOUNT_ALLOC_COW:
@@ -1235,8 +1234,8 @@ __xfs_refcount_add(
type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
blockcount);
- ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
- KM_NOFS);
+ ri = kmem_cache_alloc(xfs_refcount_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_startblock = startblock;
@@ -1742,7 +1741,7 @@ xfs_refcount_recover_cow_leftovers(
rr->rr_rrec.rc_blockcount);
/* Free the block. */
- xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+ xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
error = xfs_trans_commit(tp);
if (error)
@@ -1782,3 +1781,20 @@ xfs_refcount_has_record(
return xfs_btree_has_record(cur, &low, &high, exists);
}
+
+int __init
+xfs_refcount_intent_init_cache(void)
+{
+ xfs_refcount_intent_cache = kmem_cache_create("xfs_refc_intent",
+ sizeof(struct xfs_refcount_intent),
+ 0, 0, NULL);
+
+ return xfs_refcount_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_refcount_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_refcount_intent_cache);
+ xfs_refcount_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 02cb3aa405be..9eb01edbd89d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -32,8 +32,8 @@ enum xfs_refcount_intent_type {
struct xfs_refcount_intent {
struct list_head ri_list;
enum xfs_refcount_intent_type ri_type;
- xfs_fsblock_t ri_startblock;
xfs_extlen_t ri_blockcount;
+ xfs_fsblock_t ri_startblock;
};
void xfs_refcount_increase_extent(struct xfs_trans *tp,
@@ -83,4 +83,9 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
+extern struct kmem_cache *xfs_refcount_intent_cache;
+
+int __init xfs_refcount_intent_init_cache(void);
+void xfs_refcount_intent_destroy_cache(void);
+
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 1ef9b99962ab..d14c1720b0fb 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -21,6 +21,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
+static struct kmem_cache *xfs_refcountbt_cur_cache;
+
static struct xfs_btree_cur *
xfs_refcountbt_dup_cursor(
struct xfs_btree_cur *cur)
@@ -322,11 +324,8 @@ xfs_refcountbt_init_common(
ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = XFS_BTNUM_REFC;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+ mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
@@ -396,6 +395,18 @@ xfs_refcountbt_commit_staged_btree(
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
}
+/* Calculate number of records in a refcount btree block. */
+static inline unsigned int
+xfs_refcountbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_refcount_ptr_t));
+}
+
/*
* Calculate the number of records in a refcount btree block.
*/
@@ -405,11 +416,22 @@ xfs_refcountbt_maxrecs(
bool leaf)
{
blocklen -= XFS_REFCOUNT_BLOCK_LEN;
+ return xfs_refcountbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(struct xfs_refcount_rec);
- return blocklen / (sizeof(struct xfs_refcount_key) +
- sizeof(xfs_refcount_ptr_t));
+/* Compute the max possible height of the maximally sized refcount btree. */
+unsigned int
+xfs_refcountbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_refcountbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_refcountbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_CRC_AG_BLOCKS);
}
/* Compute the maximum height of a refcount btree. */
@@ -417,8 +439,14 @@ void
xfs_refcountbt_compute_maxlevels(
struct xfs_mount *mp)
{
+ if (!xfs_has_reflink(mp)) {
+ mp->m_refc_maxlevels = 0;
+ return;
+ }
+
mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(
mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+ ASSERT(mp->m_refc_maxlevels <= xfs_refcountbt_maxlevels_ondisk());
}
/* Calculate the refcount btree size for some records. */
@@ -488,3 +516,22 @@ xfs_refcountbt_calc_reserves(
return error;
}
+
+int __init
+xfs_refcountbt_init_cur_cache(void)
+{
+ xfs_refcountbt_cur_cache = kmem_cache_create("xfs_refcbt_cur",
+ xfs_btree_cur_sizeof(xfs_refcountbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_refcountbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_refcountbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_refcountbt_cur_cache);
+ xfs_refcountbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index bd9ed9e1e41f..d66b37259bed 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -65,4 +65,9 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
+unsigned int xfs_refcountbt_maxlevels_ondisk(void);
+
+int __init xfs_refcountbt_init_cur_cache(void);
+void xfs_refcountbt_destroy_cur_cache(void);
+
#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index f45929b1b94a..cd322174dbff 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -24,6 +24,8 @@
#include "xfs_inode.h"
#include "xfs_ag.h"
+struct kmem_cache *xfs_rmap_intent_cache;
+
/*
* Lookup the first record less than or equal to [bno, len, owner, offset]
* in the btree given by cur.
@@ -2485,7 +2487,7 @@ __xfs_rmap_add(
bmap->br_blockcount,
bmap->br_state);
- ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_owner = owner;
@@ -2779,3 +2781,20 @@ const struct xfs_owner_info XFS_RMAP_OINFO_REFC = {
const struct xfs_owner_info XFS_RMAP_OINFO_COW = {
.oi_owner = XFS_RMAP_OWN_COW,
};
+
+int __init
+xfs_rmap_intent_init_cache(void)
+{
+ xfs_rmap_intent_cache = kmem_cache_create("xfs_rmap_intent",
+ sizeof(struct xfs_rmap_intent),
+ 0, 0, NULL);
+
+ return xfs_rmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_rmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_rmap_intent_cache);
+ xfs_rmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index fd67904ed446..b718ebeda372 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -159,8 +159,8 @@ enum xfs_rmap_intent_type {
struct xfs_rmap_intent {
struct list_head ri_list;
enum xfs_rmap_intent_type ri_type;
- uint64_t ri_owner;
int ri_whichfork;
+ uint64_t ri_owner;
struct xfs_bmbt_irec ri_bmap;
};
@@ -215,4 +215,9 @@ extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES;
extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC;
extern const struct xfs_owner_info XFS_RMAP_OINFO_COW;
+extern struct kmem_cache *xfs_rmap_intent_cache;
+
+int __init xfs_rmap_intent_init_cache(void);
+void xfs_rmap_intent_destroy_cache(void);
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index b7dbbfb3aeed..69e104d0277f 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -22,6 +22,8 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+static struct kmem_cache *xfs_rmapbt_cur_cache;
+
/*
* Reverse map btree.
*
@@ -451,13 +453,10 @@ xfs_rmapbt_init_common(
{
struct xfs_btree_cur *cur;
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
/* Overlapping btree; 2 keys per pointer. */
- cur->bc_btnum = XFS_BTNUM_RMAP;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+ mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
cur->bc_ops = &xfs_rmapbt_ops;
@@ -522,6 +521,18 @@ xfs_rmapbt_commit_staged_btree(
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
}
+/* Calculate number of records in a reverse mapping btree block. */
+static inline unsigned int
+xfs_rmapbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+}
+
/*
* Calculate number of records in an rmap btree block.
*/
@@ -531,11 +542,33 @@ xfs_rmapbt_maxrecs(
int leaf)
{
blocklen -= XFS_RMAP_BLOCK_LEN;
+ return xfs_rmapbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(struct xfs_rmap_rec);
- return blocklen /
- (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+/* Compute the max possible height for reverse mapping btrees. */
+unsigned int
+xfs_rmapbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32 (per the
+ * refcount record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records. However, we're likely to run
+ * out of blocks in the AG long before that happens, which means that
+ * we must compute the max height based on what the btree will look
+ * like if it consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
}
/* Compute the maximum height of an rmap btree. */
@@ -543,26 +576,36 @@ void
xfs_rmapbt_compute_maxlevels(
struct xfs_mount *mp)
{
- /*
- * On a non-reflink filesystem, the maximum number of rmap
- * records is the number of blocks in the AG, hence the max
- * rmapbt height is log_$maxrecs($agblocks). However, with
- * reflink each AG block can have up to 2^32 (per the refcount
- * record format) owners, which means that theoretically we
- * could face up to 2^64 rmap records.
- *
- * That effectively means that the max rmapbt height must be
- * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
- * blocks to feed the rmapbt long before the rmapbt reaches
- * maximum height. The reflink code uses ag_resv_critical to
- * disallow reflinking when less than 10% of the per-AG metadata
- * block reservation since the fallback is a regular file copy.
- */
- if (xfs_has_reflink(mp))
- mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
- else
+ if (!xfs_has_rmapbt(mp)) {
+ mp->m_rmap_maxlevels = 0;
+ return;
+ }
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * Compute the asymptotic maxlevels for an rmap btree on a
+ * filesystem that supports reflink.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32
+ * (per the refcount record format) owners, which means that
+ * theoretically we could face up to 2^64 rmap records.
+ * However, we're likely to run out of blocks in the AG long
+ * before that happens, which means that we must compute the
+ * max height based on what the btree will look like if it
+ * consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
+ mp->m_sb.sb_agblocks);
+ } else {
+ /*
+ * If there's no block sharing, compute the maximum rmapbt
+ * height assuming one rmap record per AG block.
+ */
mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+ }
+ ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
}
/* Calculate the refcount btree size for some records. */
@@ -633,3 +676,22 @@ xfs_rmapbt_calc_reserves(
return error;
}
+
+int __init
+xfs_rmapbt_init_cur_cache(void)
+{
+ xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
+ xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rmapbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rmapbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rmapbt_cur_cache);
+ xfs_rmapbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index f2eee6572af4..3244715dd111 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -59,4 +59,9 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
+unsigned int xfs_rmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rmapbt_init_cur_cache(void);
+void xfs_rmapbt_destroy_cur_cache(void);
+
#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index e58349be78bd..f4e84aa1d50a 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -495,7 +495,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
static void
__xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from,
+ struct xfs_dsb *from,
bool convert_xquota)
{
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
@@ -571,7 +571,7 @@ __xfs_sb_from_disk(
void
xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from)
+ struct xfs_dsb *from)
{
__xfs_sb_from_disk(to, from, true);
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5e300daa2559..6f83d9b306ee 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -70,7 +70,7 @@ xfs_allocfree_log_count(
{
uint blocks;
- blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
+ blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
if (xfs_has_rmapbt(mp))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
if (xfs_has_reflink(mp))
@@ -814,6 +814,19 @@ xfs_trans_resv_calc(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
/*
* The following transactions are logged in physical format and
* require a permanent reservation on space.
@@ -916,4 +929,7 @@ xfs_trans_resv_calc(
resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
}
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 50332be34388..87b31c69a773 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -17,6 +17,13 @@
/* Adding one rmap could split every level up to the top of the tree. */
#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+/*
+ * Note that we historically set m_rmap_maxlevels to 9 when reflink is enabled,
+ * so we must preserve this behavior to avoid changing the transaction space
+ * reservations and minimum log size calculations for existing filesystems.
+ */
+#define XFS_OLD_REFLINK_RMAP_MAXLEVELS 9
+
/* Blocks we might need to add "b" rmaps to a tree. */
#define XFS_NRMAPADD_SPACE_RES(mp, b)\
(((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
@@ -74,7 +81,7 @@
#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
#define XFS_GROWFS_SPACE_RES(mp) \
- (2 * (mp)->m_ag_maxlevels)
+ (2 * (mp)->m_alloc_maxlevels)
#define XFS_GROWFSRT_SPACE_RES(mp,b) \
((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
#define XFS_LINK_SPACE_RES(mp,nl) \
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index ae3c9f6e2c69..90aebfe9dc5f 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -281,7 +281,7 @@ xchk_superblock(
features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT);
if ((sb->sb_features2 & features_mask) !=
(cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
- xchk_block_set_corrupt(sc, bp);
+ xchk_block_set_preen(sc, bp);
if (!xfs_has_crc(mp)) {
/* all v5 fields must be zero */
@@ -290,39 +290,38 @@ xchk_superblock(
offsetof(struct xfs_dsb, sb_features_compat)))
xchk_block_set_corrupt(sc, bp);
} else {
- /* Check compat flags; all are set at mkfs time. */
- features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN);
- if ((sb->sb_features_compat & features_mask) !=
- (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask))
+ /* compat features must match */
+ if (sb->sb_features_compat !=
+ cpu_to_be32(mp->m_sb.sb_features_compat))
xchk_block_set_corrupt(sc, bp);
- /* Check ro compat flags; all are set at mkfs time. */
- features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN |
- XFS_SB_FEAT_RO_COMPAT_FINOBT |
- XFS_SB_FEAT_RO_COMPAT_RMAPBT |
- XFS_SB_FEAT_RO_COMPAT_REFLINK);
- if ((sb->sb_features_ro_compat & features_mask) !=
- (cpu_to_be32(mp->m_sb.sb_features_ro_compat) &
- features_mask))
+ /* ro compat features must match */
+ if (sb->sb_features_ro_compat !=
+ cpu_to_be32(mp->m_sb.sb_features_ro_compat))
xchk_block_set_corrupt(sc, bp);
- /* Check incompat flags; all are set at mkfs time. */
- features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN |
- XFS_SB_FEAT_INCOMPAT_FTYPE |
- XFS_SB_FEAT_INCOMPAT_SPINODES |
- XFS_SB_FEAT_INCOMPAT_META_UUID);
- if ((sb->sb_features_incompat & features_mask) !=
- (cpu_to_be32(mp->m_sb.sb_features_incompat) &
- features_mask))
- xchk_block_set_corrupt(sc, bp);
+ /*
+ * NEEDSREPAIR is ignored on a secondary super, so we should
+ * clear it when we find it, though it's not a corruption.
+ */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+ if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^
+ sb->sb_features_incompat) & features_mask)
+ xchk_block_set_preen(sc, bp);
- /* Check log incompat flags; all are set at mkfs time. */
- features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN);
- if ((sb->sb_features_log_incompat & features_mask) !=
- (cpu_to_be32(mp->m_sb.sb_features_log_incompat) &
- features_mask))
+ /* all other incompat features must match */
+ if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^
+ sb->sb_features_incompat) & ~features_mask)
xchk_block_set_corrupt(sc, bp);
+ /*
+ * log incompat features protect newer log record types from
+ * older log recovery code. Log recovery doesn't check the
+ * secondary supers, so we can clear these if needed.
+ */
+ if (sb->sb_features_log_incompat)
+ xchk_block_set_preen(sc, bp);
+
/* Don't care about sb_crc */
if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
@@ -555,11 +554,11 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
if (xfs_has_rmapbt(mp)) {
@@ -568,7 +567,7 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_rmap_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
@@ -578,7 +577,7 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_refcount_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_refc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
@@ -850,6 +849,7 @@ xchk_agi(
struct xfs_mount *mp = sc->mp;
struct xfs_agi *agi;
struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(sc->mp);
xfs_agnumber_t agno = sc->sm->sm_agno;
xfs_agblock_t agbno;
xfs_agblock_t eoag;
@@ -880,7 +880,7 @@ xchk_agi(
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > igeo->inobt_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
if (xfs_has_finobt(mp)) {
@@ -889,7 +889,7 @@ xchk_agi(
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_free_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > igeo->inobt_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 0f8deee66f15..6da7f2ca77de 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -52,6 +52,18 @@ xrep_superblock(
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+ /*
+ * Don't write out a secondary super with NEEDSREPAIR or log incompat
+ * features set, since both are ignored when set on a secondary.
+ */
+ if (xfs_has_crc(mp)) {
+ struct xfs_dsb *sb = bp->b_addr;
+
+ sb->sb_features_incompat &=
+ ~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+ sb->sb_features_log_incompat = 0;
+ }
+
/* Write this to disk. */
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
@@ -122,7 +134,7 @@ xrep_check_btree_root(
xfs_agnumber_t agno = sc->sm->sm_agno;
return xfs_verify_agbno(mp, agno, fab->root) &&
- fab->height <= XFS_BTREE_MAXLEVELS;
+ fab->height <= fab->maxlevels;
}
/*
@@ -339,18 +351,22 @@ xrep_agf(
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_bnobt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_cntbt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
+ .maxlevels = sc->mp->m_rmap_maxlevels,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
+ .maxlevels = sc->mp->m_refc_maxlevels,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -881,10 +897,12 @@ xrep_agi(
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_finobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
},
[XREP_AGI_END] = {
.buf_ops = NULL
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 1719e1c4da59..3590e10e3e62 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -24,7 +24,7 @@ struct xchk_xattr_buf {
* space bitmap follows immediately after; and we have a third buffer
* for storing intermediate bitmap results.
*/
- uint8_t buf[0];
+ uint8_t buf[];
};
/* A place to store attribute values. */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index d6d24c866bc4..b89bf9de9b1c 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -222,21 +222,21 @@ out:
* 1 2 3
*
* Pretend for this example that each leaf block has 100 btree records. For
- * the first btree record, we'll observe that bc_ptrs[0] == 1, so we record
- * that we saw block 1. Then we observe that bc_ptrs[1] == 1, so we record
- * block 4. The list is [1, 4].
+ * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
+ * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
+ * we record block 4. The list is [1, 4].
*
- * For the second btree record, we see that bc_ptrs[0] == 2, so we exit the
- * loop. The list remains [1, 4].
+ * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
+ * the loop. The list remains [1, 4].
*
* For the 101st btree record, we've moved onto leaf block 2. Now
- * bc_ptrs[0] == 1 again, so we record that we saw block 2. We see that
- * bc_ptrs[1] == 2, so we exit the loop. The list is now [1, 4, 2].
+ * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
+ * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
*
- * For the 102nd record, bc_ptrs[0] == 2, so we continue.
+ * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
*
- * For the 201st record, we've moved on to leaf block 3. bc_ptrs[0] == 1, so
- * we add 3 to the list. Now it is [1, 4, 2, 3].
+ * For the 201st record, we've moved on to leaf block 3.
+ * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
*
* For the 300th record we just exit, with the list being [1, 4, 2, 3].
*/
@@ -256,7 +256,7 @@ xbitmap_set_btcur_path(
int i;
int error;
- for (i = 0; i < cur->bc_nlevels && cur->bc_ptrs[i] == 1; i++) {
+ for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
xfs_btree_get_block(cur, i, &bp);
if (!bp)
continue;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 017da9ceaee9..a4cbbc346f60 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -402,7 +402,7 @@ xchk_bmapbt_rec(
* the root since the verifiers don't do that.
*/
if (xfs_has_crc(bs->cur->bc_mp) &&
- bs->cur->bc_ptrs[0] == 1) {
+ bs->cur->bc_levels[0].ptr == 1) {
for (i = 0; i < bs->cur->bc_nlevels - 1; i++) {
block = xfs_btree_get_block(bs->cur, i, &bp);
owner = be64_to_cpu(block->bb_u.l.bb_owner);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index eccb855dc904..39dd46f038fe 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -136,14 +136,14 @@ xchk_btree_rec(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, 0, &bp);
- rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
trace_xchk_btree_rec(bs->sc, cur, 0);
/* If this isn't the first record, are they in order? */
- if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
+ if (cur->bc_levels[0].ptr > 1 &&
+ !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
xchk_btree_set_corrupt(bs->sc, cur, 0);
- bs->firstrec = false;
memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
if (cur->bc_nlevels == 1)
@@ -152,7 +152,7 @@ xchk_btree_rec(
/* Is this at least as large as the parent low key? */
cur->bc_ops->init_key_from_rec(&key, rec);
keyblock = xfs_btree_get_block(cur, 1, &bp);
- keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
xchk_btree_set_corrupt(bs->sc, cur, 1);
@@ -161,7 +161,7 @@ xchk_btree_rec(
/* Is this no larger than the parent high key? */
cur->bc_ops->init_high_key_from_rec(&hkey, rec);
- keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
xchk_btree_set_corrupt(bs->sc, cur, 1);
}
@@ -183,23 +183,22 @@ xchk_btree_key(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, level, &bp);
- key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+ key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
trace_xchk_btree_key(bs->sc, cur, level);
/* If this isn't the first key, are they in order? */
- if (!bs->firstkey[level] &&
- !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key))
+ if (cur->bc_levels[level].ptr > 1 &&
+ !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
xchk_btree_set_corrupt(bs->sc, cur, level);
- bs->firstkey[level] = false;
- memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len);
+ memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
if (level + 1 >= cur->bc_nlevels)
return;
/* Is this at least as large as the parent low key? */
keyblock = xfs_btree_get_block(cur, level + 1, &bp);
- keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
xchk_btree_set_corrupt(bs->sc, cur, level);
@@ -207,8 +206,9 @@ xchk_btree_key(
return;
/* Is this no larger than the parent high key? */
- key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
- keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
+ keyblock);
if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
xchk_btree_set_corrupt(bs->sc, cur, level);
}
@@ -291,7 +291,7 @@ xchk_btree_block_check_sibling(
/* Compare upper level pointer to sibling pointer. */
pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
- pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock);
+ pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
if (!xchk_btree_ptr_ok(bs, level + 1, pp))
goto out;
if (pbp)
@@ -596,7 +596,7 @@ xchk_btree_block_keys(
/* Obtain the parent's copy of the keys for this block. */
parent_block = xfs_btree_get_block(cur, level + 1, &bp);
- parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
@@ -607,7 +607,7 @@ xchk_btree_block_keys(
/* Get high keys */
high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
- high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1],
+ high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
@@ -627,35 +627,39 @@ xchk_btree(
const struct xfs_owner_info *oinfo,
void *private)
{
- struct xchk_btree bs = {
- .cur = cur,
- .scrub_rec = scrub_fn,
- .oinfo = oinfo,
- .firstrec = true,
- .private = private,
- .sc = sc,
- };
union xfs_btree_ptr ptr;
+ struct xchk_btree *bs;
union xfs_btree_ptr *pp;
union xfs_btree_rec *recp;
struct xfs_btree_block *block;
- int level;
struct xfs_buf *bp;
struct check_owner *co;
struct check_owner *n;
- int i;
+ size_t cur_sz;
+ int level;
int error = 0;
- /* Initialize scrub state */
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
- bs.firstkey[i] = true;
- INIT_LIST_HEAD(&bs.to_check);
-
- /* Don't try to check a tree with a height we can't handle. */
- if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) {
+ /*
+ * Allocate the btree scrub context from the heap, because this
+ * structure can get rather large. Don't let a caller feed us a
+ * totally absurd size.
+ */
+ cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
+ if (cur_sz > PAGE_SIZE) {
xchk_btree_set_corrupt(sc, cur, 0);
- goto out;
+ return 0;
}
+ bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
+ if (!bs)
+ return -ENOMEM;
+ bs->cur = cur;
+ bs->scrub_rec = scrub_fn;
+ bs->oinfo = oinfo;
+ bs->private = private;
+ bs->sc = sc;
+
+ /* Initialize scrub state */
+ INIT_LIST_HEAD(&bs->to_check);
/*
* Load the root of the btree. The helper function absorbs
@@ -663,79 +667,82 @@ xchk_btree(
*/
level = cur->bc_nlevels - 1;
cur->bc_ops->init_ptr_from_cur(cur, &ptr);
- if (!xchk_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr))
+ if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
goto out;
- error = xchk_btree_get_block(&bs, level, &ptr, &block, &bp);
+ error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
if (error || !block)
goto out;
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
while (level < cur->bc_nlevels) {
block = xfs_btree_get_block(cur, level, &bp);
if (level == 0) {
/* End of leaf, pop back towards the root. */
- if (cur->bc_ptrs[level] >
+ if (cur->bc_levels[level].ptr >
be16_to_cpu(block->bb_numrecs)) {
- xchk_btree_block_keys(&bs, level, block);
+ xchk_btree_block_keys(bs, level, block);
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
/* Records in order for scrub? */
- xchk_btree_rec(&bs);
+ xchk_btree_rec(bs);
/* Call out to the record checker. */
- recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
- error = bs.scrub_rec(&bs, recp);
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
+ error = bs->scrub_rec(bs, recp);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
break;
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
continue;
}
/* End of node, pop back towards the root. */
- if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
- xchk_btree_block_keys(&bs, level, block);
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
+ xchk_btree_block_keys(bs, level, block);
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
/* Keys in order for scrub? */
- xchk_btree_key(&bs, level);
+ xchk_btree_key(bs, level);
/* Drill another level deeper. */
- pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
- if (!xchk_btree_ptr_ok(&bs, level, pp)) {
- cur->bc_ptrs[level]++;
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
+ if (!xchk_btree_ptr_ok(bs, level, pp)) {
+ cur->bc_levels[level].ptr++;
continue;
}
level--;
- error = xchk_btree_get_block(&bs, level, pp, &block, &bp);
+ error = xchk_btree_get_block(bs, level, pp, &block, &bp);
if (error || !block)
goto out;
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
}
out:
/* Process deferred owner checks on btree blocks. */
- list_for_each_entry_safe(co, n, &bs.to_check, list) {
- if (!error && bs.cur)
- error = xchk_btree_check_block_owner(&bs,
- co->level, co->daddr);
+ list_for_each_entry_safe(co, n, &bs->to_check, list) {
+ if (!error && bs->cur)
+ error = xchk_btree_check_block_owner(bs, co->level,
+ co->daddr);
list_del(&co->list);
kmem_free(co);
}
+ kmem_free(bs);
return error;
}
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index b7d2fc01fbf9..da61a53a0b61 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -39,11 +39,22 @@ struct xchk_btree {
/* internal scrub state */
union xfs_btree_rec lastrec;
- bool firstrec;
- union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
- bool firstkey[XFS_BTREE_MAXLEVELS];
struct list_head to_check;
+
+ /* this element must come last! */
+ union xfs_btree_key lastkey[];
};
+
+/*
+ * Calculate the size of a xchk_btree structure. There are nlevels-1 slots for
+ * keys because we track leaf records separately in lastrec.
+ */
+static inline size_t
+xchk_btree_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xchk_btree *)NULL, lastkey, nlevels - 1);
+}
+
int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo,
void *private);
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 8a52514bc1ff..b962cfbbd92b 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -473,7 +473,7 @@ xchk_da_btree(
xchk_da_btree_rec_fn scrub_fn,
void *private)
{
- struct xchk_da_btree ds = {};
+ struct xchk_da_btree *ds;
struct xfs_mount *mp = sc->mp;
struct xfs_da_state_blk *blks;
struct xfs_da_node_entry *key;
@@ -486,32 +486,35 @@ xchk_da_btree(
return 0;
/* Set up initial da state. */
- ds.dargs.dp = sc->ip;
- ds.dargs.whichfork = whichfork;
- ds.dargs.trans = sc->tp;
- ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
- ds.state = xfs_da_state_alloc(&ds.dargs);
- ds.sc = sc;
- ds.private = private;
+ ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL);
+ if (!ds)
+ return -ENOMEM;
+ ds->dargs.dp = sc->ip;
+ ds->dargs.whichfork = whichfork;
+ ds->dargs.trans = sc->tp;
+ ds->dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds->state = xfs_da_state_alloc(&ds->dargs);
+ ds->sc = sc;
+ ds->private = private;
if (whichfork == XFS_ATTR_FORK) {
- ds.dargs.geo = mp->m_attr_geo;
- ds.lowest = 0;
- ds.highest = 0;
+ ds->dargs.geo = mp->m_attr_geo;
+ ds->lowest = 0;
+ ds->highest = 0;
} else {
- ds.dargs.geo = mp->m_dir_geo;
- ds.lowest = ds.dargs.geo->leafblk;
- ds.highest = ds.dargs.geo->freeblk;
+ ds->dargs.geo = mp->m_dir_geo;
+ ds->lowest = ds->dargs.geo->leafblk;
+ ds->highest = ds->dargs.geo->freeblk;
}
- blkno = ds.lowest;
+ blkno = ds->lowest;
level = 0;
/* Find the root of the da tree, if present. */
- blks = ds.state->path.blk;
- error = xchk_da_btree_block(&ds, level, blkno);
+ blks = ds->state->path.blk;
+ error = xchk_da_btree_block(ds, level, blkno);
if (error)
goto out_state;
/*
- * We didn't find a block at ds.lowest, which means that there's
+ * We didn't find a block at ds->lowest, which means that there's
* no LEAF1/LEAFN tree (at least not where it's supposed to be),
* so jump out now.
*/
@@ -523,16 +526,16 @@ xchk_da_btree(
/* Handle leaf block. */
if (blks[level].magic != XFS_DA_NODE_MAGIC) {
/* End of leaf, pop back towards the root. */
- if (blks[level].index >= ds.maxrecs[level]) {
+ if (blks[level].index >= ds->maxrecs[level]) {
if (level > 0)
blks[level - 1].index++;
- ds.tree_level++;
+ ds->tree_level++;
level--;
continue;
}
/* Dispatch record scrubbing. */
- error = scrub_fn(&ds, level);
+ error = scrub_fn(ds, level);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
@@ -545,17 +548,17 @@ xchk_da_btree(
/* End of node, pop back towards the root. */
- if (blks[level].index >= ds.maxrecs[level]) {
+ if (blks[level].index >= ds->maxrecs[level]) {
if (level > 0)
blks[level - 1].index++;
- ds.tree_level++;
+ ds->tree_level++;
level--;
continue;
}
/* Hashes in order for scrub? */
- key = xchk_da_btree_node_entry(&ds, level);
- error = xchk_da_btree_hash(&ds, level, &key->hashval);
+ key = xchk_da_btree_node_entry(ds, level);
+ error = xchk_da_btree_hash(ds, level, &key->hashval);
if (error)
goto out;
@@ -564,11 +567,11 @@ xchk_da_btree(
level++;
if (level >= XFS_DA_NODE_MAXDEPTH) {
/* Too deep! */
- xchk_da_set_corrupt(&ds, level - 1);
+ xchk_da_set_corrupt(ds, level - 1);
break;
}
- ds.tree_level--;
- error = xchk_da_btree_block(&ds, level, blkno);
+ ds->tree_level--;
+ error = xchk_da_btree_block(ds, level, blkno);
if (error)
goto out;
if (blks[level].bp == NULL)
@@ -587,6 +590,7 @@ out:
}
out_state:
- xfs_da_state_free(ds.state);
+ xfs_da_state_free(ds->state);
+ kmem_free(ds);
return error;
}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 200a63f58fe7..38897adde7b5 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -497,6 +497,7 @@ STATIC int
xchk_directory_leaf1_bestfree(
struct xfs_scrub *sc,
struct xfs_da_args *args,
+ xfs_dir2_db_t last_data_db,
xfs_dablk_t lblk)
{
struct xfs_dir3_icleaf_hdr leafhdr;
@@ -534,10 +535,14 @@ xchk_directory_leaf1_bestfree(
}
/*
- * There should be as many bestfree slots as there are dir data
- * blocks that can fit under i_size.
+ * There must be enough bestfree slots to cover all the directory data
+ * blocks that we scanned. It is possible for there to be a hole
+ * between the last data block and i_disk_size. This seems like an
+ * oversight to the scrub author, but as we have been writing out
+ * directories like this (and xfs_repair doesn't mind them) for years,
+ * that's what we have to check.
*/
- if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_disk_size)) {
+ if (bestcount != last_data_db + 1) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out;
}
@@ -669,6 +674,7 @@ xchk_directory_blocks(
xfs_fileoff_t lblk;
struct xfs_iext_cursor icur;
xfs_dablk_t dabno;
+ xfs_dir2_db_t last_data_db = 0;
bool found;
int is_block = 0;
int error;
@@ -712,6 +718,7 @@ xchk_directory_blocks(
args.geo->fsbcount);
lblk < got.br_startoff + got.br_blockcount;
lblk += args.geo->fsbcount) {
+ last_data_db = xfs_dir2_da_to_db(args.geo, lblk);
error = xchk_directory_data_bestfree(sc, lblk,
is_block);
if (error)
@@ -734,7 +741,7 @@ xchk_directory_blocks(
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out;
}
- error = xchk_directory_leaf1_bestfree(sc, &args,
+ error = xchk_directory_leaf1_bestfree(sc, &args, last_data_db,
leaf_lblk);
if (error)
goto out;
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 2405b09d03d0..eac15af7b08c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -233,6 +233,7 @@ xchk_dinode(
unsigned long long isize;
uint64_t flags2;
uint32_t nextents;
+ prid_t prid;
uint16_t flags;
uint16_t mode;
@@ -267,6 +268,7 @@ xchk_dinode(
* so just mark this inode for preening.
*/
xchk_ino_set_preen(sc, ino);
+ prid = 0;
break;
case 2:
case 3:
@@ -279,12 +281,17 @@ xchk_dinode(
if (dip->di_projid_hi != 0 &&
!xfs_has_projid32(mp))
xchk_ino_set_corrupt(sc, ino);
+
+ prid = be16_to_cpu(dip->di_projid_lo);
break;
default:
xchk_ino_set_corrupt(sc, ino);
return;
}
+ if (xfs_has_projid32(mp))
+ prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16;
+
/*
* di_uid/di_gid -- -1 isn't invalid, but there's no way that
* userspace could have created that.
@@ -293,6 +300,13 @@ xchk_dinode(
dip->di_gid == cpu_to_be32(-1U))
xchk_ino_set_warning(sc, ino);
+ /*
+ * project id of -1 isn't supposed to be valid, but the kernel didn't
+ * always validate that.
+ */
+ if (prid == -1U)
+ xchk_ino_set_warning(sc, ino);
+
/* di_format */
switch (dip->di_format) {
case XFS_DINODE_FMT_DEV:
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index d6c1b00a4fc8..3c7506c7553c 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -48,10 +48,10 @@ xchk_setup_quota(
dqtype = xchk_quota_to_dqtype(sc);
if (dqtype == 0)
return -EINVAL;
- sc->flags |= XCHK_HAS_QUOTAOFFLOCK;
- mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
+
if (!xfs_this_quota_on(sc->mp, dqtype))
return -ENOENT;
+
error = xchk_setup_fs(sc);
if (error)
return error;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 8f3cba14ada3..1e7b6b209ee8 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -25,6 +25,7 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
+#include "xfs_qm.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -912,11 +913,13 @@ xrep_force_quotacheck(
if (!(flag & sc->mp->m_qflags))
return;
+ mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
sc->mp->m_qflags &= ~flag;
spin_lock(&sc->mp->m_sb_lock);
sc->mp->m_sb.sb_qflags &= ~flag;
spin_unlock(&sc->mp->m_sb_lock);
xfs_log_sb(sc->tp);
+ mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
}
/*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 3bb152d52a07..840f74ec431c 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -44,6 +44,9 @@ struct xrep_find_ag_btree {
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
+ /* in: maximum btree height */
+ unsigned int maxlevels;
+
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 51e4c61916d2..b11870d07c56 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -173,10 +173,6 @@ xchk_teardown(
mnt_drop_write_file(sc->file);
if (sc->flags & XCHK_REAPING_DISABLED)
xchk_start_reaping(sc);
- if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) {
- mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
- sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK;
- }
if (sc->buf) {
kmem_free(sc->buf);
sc->buf = NULL;
@@ -461,15 +457,10 @@ xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
{
- struct xfs_scrub sc = {
- .file = file,
- .sm = sm,
- };
+ struct xfs_scrub *sc;
struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
int error = 0;
- sc.mp = mp;
-
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
(sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));
@@ -489,59 +480,68 @@ xfs_scrub_metadata(
xchk_experimental_warning(mp);
- sc.ops = &meta_scrub_ops[sm->sm_type];
- sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
+ sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
+ if (!sc) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ sc->mp = mp;
+ sc->file = file;
+ sc->sm = sm;
+ sc->ops = &meta_scrub_ops[sm->sm_type];
+ sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
retry_op:
/*
* When repairs are allowed, prevent freezing or readonly remount while
* scrub is running with a real transaction.
*/
if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
- error = mnt_want_write_file(sc.file);
+ error = mnt_want_write_file(sc->file);
if (error)
- goto out;
+ goto out_sc;
}
/* Set up for the operation. */
- error = sc.ops->setup(&sc);
+ error = sc->ops->setup(sc);
if (error)
goto out_teardown;
/* Scrub for errors. */
- error = sc.ops->scrub(&sc);
- if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
+ error = sc->ops->scrub(sc);
+ if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
/*
* Scrubbers return -EDEADLOCK to mean 'try harder'.
* Tear down everything we hold, then set up again with
* preparation for worst-case scenarios.
*/
- error = xchk_teardown(&sc, 0);
+ error = xchk_teardown(sc, 0);
if (error)
- goto out;
- sc.flags |= XCHK_TRY_HARDER;
+ goto out_sc;
+ sc->flags |= XCHK_TRY_HARDER;
goto retry_op;
} else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
goto out_teardown;
- xchk_update_health(&sc);
+ xchk_update_health(sc);
- if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
- !(sc.flags & XREP_ALREADY_FIXED)) {
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc->flags & XREP_ALREADY_FIXED)) {
bool needs_fix;
/* Let debug users force us into the repair routines. */
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT |
- XFS_SCRUB_OFLAG_PREEN));
+ needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN));
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
*/
if (!needs_fix) {
- sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
goto out_nofix;
}
@@ -549,26 +549,28 @@ retry_op:
* If it's broken, userspace wants us to fix it, and we haven't
* already tried to fix it, then attempt a repair.
*/
- error = xrep_attempt(&sc);
+ error = xrep_attempt(sc);
if (error == -EAGAIN) {
/*
* Either the repair function succeeded or it couldn't
* get all the resources it needs; either way, we go
* back to the beginning and call the scrub function.
*/
- error = xchk_teardown(&sc, 0);
+ error = xchk_teardown(sc, 0);
if (error) {
xrep_failure(mp);
- goto out;
+ goto out_sc;
}
goto retry_op;
}
}
out_nofix:
- xchk_postmortem(&sc);
+ xchk_postmortem(sc);
out_teardown:
- error = xchk_teardown(&sc, error);
+ error = xchk_teardown(sc, error);
+out_sc:
+ kmem_free(sc);
out:
trace_xchk_done(XFS_I(file_inode(file)), sm, error);
if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 80e5026bba44..3de5287e98d8 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -88,7 +88,6 @@ struct xfs_scrub {
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */
-#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */
#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */
#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index c0ef53fe6611..b5f94676c37c 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -21,13 +21,14 @@ xchk_btree_cur_fsbno(
struct xfs_btree_cur *cur,
int level)
{
- if (level < cur->bc_nlevels && cur->bc_bufs[level])
+ if (level < cur->bc_nlevels && cur->bc_levels[level].bp)
return XFS_DADDR_TO_FSB(cur->bc_mp,
- xfs_buf_daddr(cur->bc_bufs[level]));
- if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ xfs_buf_daddr(cur->bc_levels[level].bp));
+
+ if (level == cur->bc_nlevels - 1 &&
+ (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
- if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
- return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0);
+
return NULLFSBLOCK;
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index a7bbb84f91a7..93ece6df02e3 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -348,7 +348,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->error = error;
__entry->ret_ip = ret_ip;
),
@@ -389,7 +389,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->type = sc->sm->sm_type;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->error = error;
@@ -431,7 +431,7 @@ TRACE_EVENT(xchk_btree_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
@@ -471,7 +471,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
@@ -511,7 +511,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
),
TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 34fc6148032a..90b7f4d127de 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -82,6 +82,7 @@ xfs_end_ioend(
struct iomap_ioend *ioend)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -97,18 +98,26 @@ xfs_end_ioend(
/*
* Just clean up the in-memory structures if the fs has been shut down.
*/
- if (xfs_is_shutdown(ip->i_mount)) {
+ if (xfs_is_shutdown(mp)) {
error = -EIO;
goto done;
}
/*
- * Clean up any COW blocks on an I/O error.
+ * Clean up all COW blocks and underlying data fork delalloc blocks on
+ * I/O error. The delalloc punch is required because this ioend was
+ * mapped to blocks in the COW fork and the associated pages are no
+ * longer dirty. If we don't remove delalloc blocks here, they become
+ * stale and can corrupt free space accounting on unmount.
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
+ xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, offset),
+ XFS_B_TO_FSB(mp, size));
+ }
goto done;
}
@@ -127,7 +136,20 @@ done:
memalloc_nofs_restore(nofs_flag);
}
-/* Finish all pending io completions. */
+/*
+ * Finish all pending IO completions that require transactional modifications.
+ *
+ * We try to merge physical and logically contiguous ioends before completion to
+ * minimise the number of transactions we need to perform during IO completion.
+ * Both unwritten extent conversion and COW remapping need to iterate and modify
+ * one physical extent at a time, so we gain nothing by merging physically
+ * discontiguous extents here.
+ *
+ * The ioend chain length that we can be processing here is largely unbound in
+ * length and we may have to perform significant amounts of work on each ioend
+ * to complete it. Hence we have to be careful about holding the CPU for too
+ * long in this loop.
+ */
void
xfs_end_io(
struct work_struct *work)
@@ -148,6 +170,7 @@ xfs_end_io(
list_del_init(&ioend->io_list);
iomap_ioend_try_merge(ioend, &tmp);
xfs_end_ioend(ioend);
+ cond_resched();
}
}
@@ -350,7 +373,7 @@ retry:
isnullstartblock(imap.br_startblock))
goto allocate_blocks;
- xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
@@ -428,37 +451,37 @@ xfs_prepare_ioend(
* see a ENOSPC in writeback).
*/
static void
-xfs_discard_page(
- struct page *page,
- loff_t fileoff)
+xfs_discard_folio(
+ struct folio *folio,
+ loff_t pos)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- unsigned int pageoff = offset_in_page(fileoff);
- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff);
- xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
+ size_t offset = offset_in_folio(folio, pos);
+ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos);
+ xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
int error;
if (xfs_is_shutdown(mp))
goto out_invalidate;
xfs_alert_ratelimited(mp,
- "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
- page, ip->i_ino, fileoff);
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- i_blocks_per_page(inode, page) - pageoff_fsb);
+ i_blocks_per_folio(inode, folio) - pageoff_fsb);
if (error && !xfs_is_shutdown(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
- iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
+ iomap_invalidate_folio(folio, offset, folio_size(folio) - offset);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
.prepare_ioend = xfs_prepare_ioend,
- .discard_page = xfs_discard_page,
+ .discard_folio = xfs_discard_folio,
};
STATIC int
@@ -544,9 +567,9 @@ const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
@@ -558,7 +581,6 @@ const struct address_space_operations xfs_address_space_operations = {
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
.swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 2b5da6218977..27265771f247 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -390,7 +390,7 @@ out_destroy_fork:
/* kill the in-core attr fork before we drop the inode lock */
if (dp->i_afp) {
xfs_idestroy_fork(dp->i_afp);
- kmem_cache_free(xfs_ifork_zone, dp->i_afp);
+ kmem_cache_free(xfs_ifork_cache, dp->i_afp);
dp->i_afp = NULL;
}
if (lock_mode)
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index 667e297f59b1..ae4345b37621 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(unsigned int count)
return bio_max_segs(howmany(count, PAGE_SIZE));
}
-static void
-xfs_flush_bdev_async_endio(
- struct bio *bio)
-{
- complete(bio->bi_private);
-}
-
-/*
- * Submit a request for an async cache flush to run. If the request queue does
- * not require flush operations, just skip it altogether. If the caller needs
- * to wait for the flush completion at a later point in time, they must supply a
- * valid completion. This will be signalled when the flush completes. The
- * caller never sees the bio that is issued here.
- */
-void
-xfs_flush_bdev_async(
- struct bio *bio,
- struct block_device *bdev,
- struct completion *done)
-{
- struct request_queue *q = bdev->bd_disk->queue;
-
- if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
- complete(done);
- return;
- }
-
- bio_init(bio, NULL, 0);
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
- bio->bi_private = done;
- bio->bi_end_io = xfs_flush_bdev_async_endio;
-
- submit_bio(bio);
-}
int
xfs_rw_bdev(
struct block_device *bdev,
@@ -61,10 +26,9 @@ xfs_rw_bdev(
if (is_vmalloc && op == REQ_OP_WRITE)
flush_kernel_vmap_range(data, count);
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = op | REQ_META | REQ_SYNC;
do {
struct page *page = kmem_to_page(data);
@@ -74,10 +38,9 @@ xfs_rw_bdev(
while (bio_add_page(bio, page, len, off) != len) {
struct bio *prev = bio;
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_copy_dev(bio, prev);
+ bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ prev->bi_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = bio_end_sector(prev);
- bio->bi_opf = prev->bi_opf;
bio_chain(prev, bio);
submit_bio(prev);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 03159970133f..761dde155099 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -25,8 +25,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_bui_zone;
-kmem_zone_t *xfs_bud_zone;
+struct kmem_cache *xfs_bui_cache;
+struct kmem_cache *xfs_bud_cache;
static const struct xfs_item_ops xfs_bui_item_ops;
@@ -39,7 +39,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
- kmem_cache_free(xfs_bui_zone, buip);
+ kmem_cache_free(xfs_bui_cache, buip);
}
/*
@@ -138,7 +138,7 @@ xfs_bui_init(
{
struct xfs_bui_log_item *buip;
- buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL);
+ buip = kmem_cache_zalloc(xfs_bui_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -198,7 +198,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
- kmem_cache_free(xfs_bud_zone, budp);
+ kmem_cache_free(xfs_bud_cache, budp);
}
static const struct xfs_item_ops xfs_bud_item_ops = {
@@ -215,7 +215,7 @@ xfs_trans_get_bud(
{
struct xfs_bud_log_item *budp;
- budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL);
+ budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
&xfs_bud_item_ops);
budp->bud_buip = buip;
@@ -384,7 +384,7 @@ xfs_bmap_update_finish_item(
bmap->bi_bmap.br_blockcount = count;
return -EAGAIN;
}
- kmem_free(bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
return error;
}
@@ -404,7 +404,7 @@ xfs_bmap_update_cancel_item(
struct xfs_bmap_intent *bmap;
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- kmem_free(bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
}
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
@@ -463,7 +463,7 @@ xfs_bui_item_recover(
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_map_extent *bmap;
struct xfs_bud_log_item *budp;
xfs_filblks_t count;
@@ -532,7 +532,7 @@ xfs_bui_item_recover(
* Commit transaction, which frees the transaction and saves the inode
* for later replay activities.
*/
- error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list);
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
if (error)
goto err_unlock;
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index b9be62f8bd52..3fafd3881a0b 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -25,7 +25,7 @@
/* kernel only BUI/BUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -65,7 +65,7 @@ struct xfs_bud_log_item {
struct xfs_bud_log_format bud_format;
};
-extern struct kmem_zone *xfs_bui_zone;
-extern struct kmem_zone *xfs_bud_zone;
+extern struct kmem_cache *xfs_bui_cache;
+extern struct kmem_cache *xfs_bud_cache;
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 73a36b7be3bd..eb2e387ba528 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -771,8 +771,7 @@ int
xfs_alloc_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len,
- int alloc_type)
+ xfs_off_t len)
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
@@ -851,9 +850,6 @@ xfs_alloc_file_space(
rblocks = 0;
}
- /*
- * Allocate and setup the transaction.
- */
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
dblocks, rblocks, false, &tp);
if (error)
@@ -865,14 +861,14 @@ xfs_alloc_file_space(
goto error;
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
- allocatesize_fsb, alloc_type, 0, imapp,
- &nimaps);
+ allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+ &nimaps);
if (error)
goto error;
- /*
- * Complete the transaction
- */
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
@@ -1001,7 +997,7 @@ xfs_free_file_space(
/*
* Now that we've unmap all full blocks we'll have to zero out any
- * partial block at the beginning and/or end. iomap_zero_range is smart
+ * partial block at the beginning and/or end. xfs_zero_range is smart
* enough to skip any holes, including those we just created, but we
* must take care not to zero beyond EOF and enlarge i_size.
*/
@@ -1009,15 +1005,14 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
- &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, offset, len, NULL);
if (error)
return error;
/*
* If we zeroed right up to EOF and EOF straddles a page boundary we
* must make sure that the post-EOF area is also zeroed because the
- * page could be mmap'd and iomap_zero_range doesn't do that for us.
+ * page could be mmap'd and xfs_zero_range doesn't do that for us.
* Writeback of the eof page will do this, albeit clumsily.
*/
if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 9f993168b55b..24b37d211f1d 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -54,7 +54,7 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len, int alloc_type);
+ xfs_off_t len);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 5fa6cd947dd4..bf4e60871068 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -14,13 +14,14 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_recover.h"
+#include "xfs_log_priv.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_ag.h"
-static kmem_zone_t *xfs_buf_zone;
+static struct kmem_cache *xfs_buf_cache;
/*
* Locking orders
@@ -220,7 +221,7 @@ _xfs_buf_alloc(
int i;
*bpp = NULL;
- bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);
+ bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
/*
* We don't want certain flags to appear in b_flags unless they are
@@ -247,7 +248,7 @@ _xfs_buf_alloc(
*/
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
- kmem_cache_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_cache, bp);
return error;
}
@@ -307,7 +308,7 @@ xfs_buf_free(
kmem_free(bp->b_addr);
xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_cache, bp);
}
static int
@@ -394,7 +395,7 @@ xfs_buf_alloc_pages(
}
XFS_STATS_INC(bp->b_mount, xb_page_retries);
- congestion_wait(BLK_RW_ASYNC, HZ / 50);
+ memalloc_retry_wait(gfp_mask);
}
return 0;
}
@@ -405,7 +406,7 @@ xfs_buf_alloc_pages(
STATIC int
_xfs_buf_map_pages(
struct xfs_buf *bp,
- uint flags)
+ xfs_buf_flags_t flags)
{
ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
@@ -813,7 +814,15 @@ xfs_buf_read_map(
* buffer.
*/
if (error) {
- if (!xfs_is_shutdown(target->bt_mount))
+ /*
+ * Check against log shutdown for error reporting because
+ * metadata writeback may require a read first and we need to
+ * report errors in metadata writeback until the log is shut
+ * down. High level transaction read functions already check
+ * against mount shutdown, anyway, so we only need to be
+ * concerned about low level IO interactions here.
+ */
+ if (!xlog_is_shutdown(target->bt_mount->m_log))
xfs_buf_ioerror_alert(bp, fa);
bp->b_flags &= ~XBF_DONE;
@@ -843,9 +852,6 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
- if (bdi_read_congested(target->bt_bdev->bd_disk->bdi))
- return;
-
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
@@ -862,7 +868,7 @@ xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
@@ -897,7 +903,7 @@ int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
int error;
@@ -1177,10 +1183,10 @@ xfs_buf_ioend_handle_error(
struct xfs_error_cfg *cfg;
/*
- * If we've already decided to shutdown the filesystem because of I/O
- * errors, there's no point in giving this a retry.
+ * If we've already shutdown the journal because of I/O errors, there's
+ * no point in giving this a retry.
*/
- if (xfs_is_shutdown(mp))
+ if (xlog_is_shutdown(mp->m_log))
goto out_stale;
xfs_buf_ioerror_alert_ratelimited(bp);
@@ -1440,12 +1446,10 @@ next_chunk:
atomic_inc(&bp->b_io_remaining);
nr_pages = bio_max_segs(total_nr_pages);
- bio = bio_alloc(GFP_NOIO, nr_pages);
- bio_set_dev(bio, bp->b_target->bt_bdev);
+ bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
- bio->bi_opf = op;
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
@@ -1593,8 +1597,23 @@ __xfs_buf_submit(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* on shutdown we stale and complete the buffer immediately */
- if (xfs_is_shutdown(bp->b_mount)) {
+ /*
+ * On log shutdown we stale and complete the buffer immediately. We can
+ * be called to read the superblock before the log has been set up, so
+ * be careful checking the log state.
+ *
+ * Checking the mount shutdown state here can result in the log tail
+ * moving inappropriately on disk as the log may not yet be shut down.
+ * i.e. failing this buffer on mount shutdown can remove it from the AIL
+ * and move the tail of the log forwards without having written this
+ * buffer to disk. This corrupts the log tail state in memory, and
+ * because the log may not be shut down yet, it can then be propagated
+ * to disk before the log is shutdown. Hence we check log shutdown
+ * state here rather than mount state to avoid corrupting the log tail
+ * on shutdown.
+ */
+ if (bp->b_mount->m_log &&
+ xlog_is_shutdown(bp->b_mount->m_log)) {
xfs_buf_ioend_fail(bp);
return -EIO;
}
@@ -1808,10 +1827,10 @@ xfs_buftarg_drain(
* If one or more failed buffers were freed, that means dirty metadata
* was thrown away. This should only ever happen after I/O completion
* handling has elevated I/O error(s) to permanent failures and shuts
- * down the fs.
+ * down the journal.
*/
if (write_fail) {
- ASSERT(xfs_is_shutdown(btp->bt_mount));
+ ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
xfs_alert(btp->bt_mount,
"Please run xfs_repair to determine the extent of the problem.");
}
@@ -1892,6 +1911,7 @@ xfs_free_buftarg(
list_lru_destroy(&btp->bt_lru);
blkdev_issue_flush(btp->bt_bdev);
+ fs_put_dax(btp->bt_daxdev);
kmem_free(btp);
}
@@ -1932,11 +1952,10 @@ xfs_setsize_buftarg_early(
return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}
-xfs_buftarg_t *
+struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev,
- struct dax_device *dax_dev)
+ struct block_device *bdev)
{
xfs_buftarg_t *btp;
@@ -1945,7 +1964,7 @@ xfs_alloc_buftarg(
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
- btp->bt_daxdev = dax_dev;
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
@@ -2094,12 +2113,13 @@ xfs_buf_delwri_submit_buffers(
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, buffer_list, b_list) {
if (!wait_list) {
+ if (!xfs_buf_trylock(bp))
+ continue;
if (xfs_buf_ispinned(bp)) {
+ xfs_buf_unlock(bp);
pinned++;
continue;
}
- if (!xfs_buf_trylock(bp))
- continue;
} else {
xfs_buf_lock(bp);
}
@@ -2258,12 +2278,12 @@ xfs_buf_delwri_pushbuf(
int __init
xfs_buf_init(void)
{
- xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD,
NULL);
- if (!xfs_buf_zone)
+ if (!xfs_buf_cache)
goto out;
return 0;
@@ -2275,7 +2295,7 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
- kmem_cache_destroy(xfs_buf_zone);
+ kmem_cache_destroy(xfs_buf_cache);
}
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 6b0200b8007d..1ee3056ff9cf 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -22,28 +22,28 @@ struct xfs_buf;
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-#define XBF_READ (1 << 0) /* buffer intended for reading from device */
-#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
-#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
-#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
-#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */
+#define XBF_READ (1u << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
+#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
/* buffer type flags for write callbacks */
-#define _XBF_INODES (1 << 16)/* inode buffer */
-#define _XBF_DQUOTS (1 << 17)/* dquot buffer */
-#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */
+#define _XBF_INODES (1u << 16)/* inode buffer */
+#define _XBF_DQUOTS (1u << 17)/* dquot buffer */
+#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
-#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
-#define _XBF_KMEM (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
+#define _XBF_KMEM (1u << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */
+#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
typedef unsigned int xfs_buf_flags_t;
@@ -58,7 +58,7 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ _XBF_INODES, "INODES" }, \
{ _XBF_DQUOTS, "DQUOTS" }, \
- { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
+ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
@@ -89,6 +89,7 @@ typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
+ u64 bt_dax_part_off;
struct xfs_mount *bt_mount;
unsigned int bt_meta_sectorsize;
size_t bt_meta_sectormask;
@@ -246,11 +247,11 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags,
- struct xfs_buf **bpp);
+int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+ xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
- size_t numblks, int flags, struct xfs_buf **bpp,
- const struct xfs_buf_ops *ops);
+ size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops);
int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
void xfs_buf_hold(struct xfs_buf *bp);
@@ -338,8 +339,8 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
/*
* Handling of buftargs.
*/
-extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *,
- struct block_device *, struct dax_device *);
+struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
+ struct block_device *bdev);
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index b1ab100c09e1..522d450a94b1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -21,9 +21,10 @@
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
-kmem_zone_t *xfs_buf_item_zone;
+struct kmem_cache *xfs_buf_item_cache;
static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
@@ -428,7 +429,7 @@ xfs_buf_item_format(
* occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (xfs_has_v3inodes(lip->li_mountp) ||
+ if (xfs_has_v3inodes(lip->li_log->l_mp) ||
!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
@@ -616,7 +617,7 @@ xfs_buf_item_put(
* that case, the bli is freed on buffer writeback completion.
*/
aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
- xfs_is_shutdown(lip->li_mountp);
+ xlog_is_shutdown(lip->li_log);
dirty = bip->bli_flags & XFS_BLI_DIRTY;
if (dirty && !aborted)
return false;
@@ -804,7 +805,7 @@ xfs_buf_item_init(
return 0;
}
- bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL);
+ bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
@@ -825,7 +826,7 @@ xfs_buf_item_init(
map_size = DIV_ROUND_UP(chunks, NBWORD);
if (map_size > XFS_BLF_DATAMAP_SIZE) {
- kmem_cache_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_cache, bip);
xfs_err(mp,
"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
map_size,
@@ -1002,7 +1003,7 @@ xfs_buf_item_free(
{
xfs_buf_item_free_format(bip);
kmem_free(bip->bli_item.li_lv_shadow);
- kmem_cache_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_cache, bip);
}
/*
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 50aa0f5ef959..e11e9ef2338f 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -71,6 +71,6 @@ static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp)
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
-extern kmem_zone_t *xfs_buf_item_zone;
+extern struct kmem_cache *xfs_buf_item_cache;
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index a476c7ef5d53..e484251dc9c8 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -603,7 +603,7 @@ xlog_recover_do_inode_buffer(
inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
- offsetof(xfs_dinode_t, di_next_unlinked);
+ offsetof(struct xfs_dinode, di_next_unlinked);
while (next_unlinked_offset >=
(reg_buf_offset + reg_buf_bytes)) {
@@ -816,7 +816,7 @@ xlog_recover_get_buf_lsn(
}
if (lsn != (xfs_lsn_t)-1) {
- if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
goto recover_immediately;
return lsn;
}
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 8310005af00f..a7174a5b3203 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -138,7 +138,8 @@ xfs_dir2_sf_getdents(
STATIC int
xfs_dir2_block_getdents(
struct xfs_da_args *args,
- struct dir_context *ctx)
+ struct dir_context *ctx,
+ unsigned int *lock_mode)
{
struct xfs_inode *dp = args->dp; /* incore directory inode */
struct xfs_buf *bp; /* buffer for block */
@@ -146,7 +147,6 @@ xfs_dir2_block_getdents(
int wantoff; /* starting block offset */
xfs_off_t cook;
struct xfs_da_geometry *geo = args->geo;
- int lock_mode;
unsigned int offset, next_offset;
unsigned int end;
@@ -156,12 +156,13 @@ xfs_dir2_block_getdents(
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
- lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir3_block_read(args->trans, dp, &bp);
- xfs_iunlock(dp, lock_mode);
if (error)
return error;
+ xfs_iunlock(dp, *lock_mode);
+ *lock_mode = 0;
+
/*
* Extract the byte offset we start at from the seek pointer.
* We'll skip entries before this.
@@ -344,7 +345,8 @@ STATIC int
xfs_dir2_leaf_getdents(
struct xfs_da_args *args,
struct dir_context *ctx,
- size_t bufsize)
+ size_t bufsize,
+ unsigned int *lock_mode)
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
@@ -356,7 +358,6 @@ xfs_dir2_leaf_getdents(
xfs_dir2_off_t curoff; /* current overall offset */
int length; /* temporary length value */
int byteoff; /* offset in current block */
- int lock_mode;
unsigned int offset = 0;
int error = 0; /* error return value */
@@ -390,13 +391,16 @@ xfs_dir2_leaf_getdents(
bp = NULL;
}
- lock_mode = xfs_ilock_data_map_shared(dp);
+ if (*lock_mode == 0)
+ *lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
&rablk, &bp);
- xfs_iunlock(dp, lock_mode);
if (error || !bp)
break;
+ xfs_iunlock(dp, *lock_mode);
+ *lock_mode = 0;
+
xfs_dir3_data_check(dp, bp);
/*
* Find our position in the block.
@@ -496,7 +500,7 @@ xfs_dir2_leaf_getdents(
*
* If supplied, the transaction collects locked dir buffers to avoid
* nested buffer deadlocks. This function does not dirty the
- * transaction. The caller should ensure that the inode is locked
+ * transaction. The caller must hold the IOLOCK (shared or exclusive)
* before calling this function.
*/
int
@@ -507,8 +511,9 @@ xfs_readdir(
size_t bufsize)
{
struct xfs_da_args args = { NULL };
- int rval;
- int v;
+ unsigned int lock_mode;
+ int isblock;
+ int error;
trace_xfs_readdir(dp);
@@ -516,6 +521,7 @@ xfs_readdir(
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
@@ -523,13 +529,22 @@ xfs_readdir(
args.trans = tp;
if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_getdents(&args, ctx);
- else if ((rval = xfs_dir2_isblock(&args, &v)))
- ;
- else if (v)
- rval = xfs_dir2_block_getdents(&args, ctx);
- else
- rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
+ return xfs_dir2_sf_getdents(&args, ctx);
- return rval;
+ lock_mode = xfs_ilock_data_map_shared(dp);
+ error = xfs_dir2_isblock(&args, &isblock);
+ if (error)
+ goto out_unlock;
+
+ if (isblock) {
+ error = xfs_dir2_block_getdents(&args, ctx, &lock_mode);
+ goto out_unlock;
+ }
+
+ error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode);
+
+out_unlock:
+ if (lock_mode)
+ xfs_iunlock(dp, lock_mode);
+ return error;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index c15d61d47a06..5afedcbc78c7 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -38,8 +38,8 @@
* otherwise by the lowest id first, see xfs_dqlock2.
*/
-struct kmem_zone *xfs_qm_dqtrxzone;
-static struct kmem_zone *xfs_qm_dqzone;
+struct kmem_cache *xfs_dqtrx_cache;
+static struct kmem_cache *xfs_dquot_cache;
static struct lock_class_key xfs_dquot_group_class;
static struct lock_class_key xfs_dquot_project_class;
@@ -57,7 +57,7 @@ xfs_qm_dqdestroy(
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
- kmem_cache_free(xfs_qm_dqzone, dqp);
+ kmem_cache_free(xfs_dquot_cache, dqp);
}
/*
@@ -289,13 +289,12 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
*/
STATIC int
xfs_dquot_disk_alloc(
- struct xfs_trans **tpp,
struct xfs_dquot *dqp,
struct xfs_buf **bpp)
{
struct xfs_bmbt_irec map;
- struct xfs_trans *tp = *tpp;
- struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = dqp->q_mount;
struct xfs_buf *bp;
xfs_dqtype_t qtype = xfs_dquot_type(dqp);
struct xfs_inode *quotip = xfs_quota_inode(mp, qtype);
@@ -304,29 +303,35 @@ xfs_dquot_disk_alloc(
trace_xfs_dqalloc(dqp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+ XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
+ if (error)
+ return error;
+
xfs_ilock(quotip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, quotip, 0);
+
if (!xfs_this_quota_on(dqp->q_mount, qtype)) {
/*
* Return if this type of quotas is turned off while we didn't
* have an inode lock
*/
- xfs_iunlock(quotip, XFS_ILOCK_EXCL);
- return -ESRCH;
+ error = -ESRCH;
+ goto err_cancel;
}
- xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
-
error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
- return error;
+ goto err_cancel;
/* Create the block mapping. */
error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
&nmaps);
if (error)
- return error;
+ goto err_cancel;
+
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
ASSERT(nmaps == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -341,7 +346,7 @@ xfs_dquot_disk_alloc(
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp);
if (error)
- return error;
+ goto err_cancel;
bp->b_ops = &xfs_dquot_buf_ops;
/*
@@ -371,16 +376,25 @@ xfs_dquot_disk_alloc(
* is responsible for unlocking any buffer passed back, either
* manually or by committing the transaction. On error, the buffer is
* released and not passed back.
+ *
+ * Keep the quota inode ILOCKed until after the transaction commit to
+ * maintain the atomicity of bmap/rmap updates.
*/
xfs_trans_bhold(tp, bp);
- error = xfs_defer_finish(tpp);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
if (error) {
- xfs_trans_bhold_release(*tpp, bp);
- xfs_trans_brelse(*tpp, bp);
+ xfs_buf_relse(bp);
return error;
}
+
*bpp = bp;
return 0;
+
+err_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+ return error;
}
/*
@@ -458,7 +472,7 @@ xfs_dquot_alloc(
{
struct xfs_dquot *dqp;
- dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL);
+ dqp = kmem_cache_zalloc(xfs_dquot_cache, GFP_KERNEL | __GFP_NOFAIL);
dqp->q_type = type;
dqp->q_id = id;
@@ -471,7 +485,7 @@ xfs_dquot_alloc(
* Offset of dquot in the (fixed sized) dquot chunk.
*/
dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
- sizeof(xfs_dqblk_t);
+ sizeof(struct xfs_dqblk);
/*
* Because we want to use a counting completion, complete
@@ -629,43 +643,6 @@ xfs_dquot_to_disk(
ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer);
}
-/* Allocate and initialize the dquot buffer for this in-core dquot. */
-static int
-xfs_qm_dqread_alloc(
- struct xfs_mount *mp,
- struct xfs_dquot *dqp,
- struct xfs_buf **bpp)
-{
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
- XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
- if (error)
- goto err;
-
- error = xfs_dquot_disk_alloc(&tp, dqp, bpp);
- if (error)
- goto err_cancel;
-
- error = xfs_trans_commit(tp);
- if (error) {
- /*
- * Buffer was held to the transaction, so we have to unlock it
- * manually here because we're not passing it back.
- */
- xfs_buf_relse(*bpp);
- *bpp = NULL;
- goto err;
- }
- return 0;
-
-err_cancel:
- xfs_trans_cancel(tp);
-err:
- return error;
-}
-
/*
* Read in the ondisk dquot using dqtobp() then copy it to an incore version,
* and release the buffer immediately. If @can_alloc is true, fill any
@@ -689,7 +666,7 @@ xfs_qm_dqread(
/* Try to read the buffer, allocating if necessary. */
error = xfs_dquot_disk_read(mp, dqp, &bp);
if (error == -ENOENT && can_alloc)
- error = xfs_qm_dqread_alloc(mp, dqp, &bp);
+ error = xfs_dquot_disk_alloc(dqp, &bp);
if (error)
goto err;
@@ -1363,22 +1340,22 @@ xfs_dqlock2(
int __init
xfs_qm_init(void)
{
- xfs_qm_dqzone = kmem_cache_create("xfs_dquot",
+ xfs_dquot_cache = kmem_cache_create("xfs_dquot",
sizeof(struct xfs_dquot),
0, 0, NULL);
- if (!xfs_qm_dqzone)
+ if (!xfs_dquot_cache)
goto out;
- xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx",
+ xfs_dqtrx_cache = kmem_cache_create("xfs_dqtrx",
sizeof(struct xfs_dquot_acct),
0, 0, NULL);
- if (!xfs_qm_dqtrxzone)
- goto out_free_dqzone;
+ if (!xfs_dqtrx_cache)
+ goto out_free_dquot_cache;
return 0;
-out_free_dqzone:
- kmem_cache_destroy(xfs_qm_dqzone);
+out_free_dquot_cache:
+ kmem_cache_destroy(xfs_dquot_cache);
out:
return -ENOMEM;
}
@@ -1386,8 +1363,8 @@ out:
void
xfs_qm_exit(void)
{
- kmem_cache_destroy(xfs_qm_dqtrxzone);
- kmem_cache_destroy(xfs_qm_dqzone);
+ kmem_cache_destroy(xfs_dqtrx_cache);
+ kmem_cache_destroy(xfs_dquot_cache);
}
/*
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 81c445e9489b..749fd18c4f32 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -213,11 +213,12 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
NULL,
};
+ATTRIBUTE_GROUPS(xfs_errortag);
static struct kobj_type xfs_errortag_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_errortag_sysfs_ops,
- .default_attrs = xfs_errortag_attrs,
+ .default_groups = xfs_errortag_groups,
};
int
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3f8a0713573a..0e50f2c9348e 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -25,8 +25,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_efi_zone;
-kmem_zone_t *xfs_efd_zone;
+struct kmem_cache *xfs_efi_cache;
+struct kmem_cache *xfs_efd_cache;
static const struct xfs_item_ops xfs_efi_item_ops;
@@ -43,7 +43,7 @@ xfs_efi_item_free(
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
kmem_free(efip);
else
- kmem_cache_free(xfs_efi_zone, efip);
+ kmem_cache_free(xfs_efi_cache, efip);
}
/*
@@ -161,7 +161,7 @@ xfs_efi_init(
((nextents - 1) * sizeof(xfs_extent_t)));
efip = kmem_zalloc(size, 0);
} else {
- efip = kmem_cache_zalloc(xfs_efi_zone,
+ efip = kmem_cache_zalloc(xfs_efi_cache,
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -241,7 +241,7 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
kmem_free(efdp);
else
- kmem_cache_free(xfs_efd_zone, efdp);
+ kmem_cache_free(xfs_efd_cache, efdp);
}
/*
@@ -333,7 +333,7 @@ xfs_trans_get_efd(
(nextents - 1) * sizeof(struct xfs_extent),
0);
} else {
- efdp = kmem_cache_zalloc(xfs_efd_zone,
+ efdp = kmem_cache_zalloc(xfs_efd_cache,
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -474,15 +474,21 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_extent_free_item *free;
int error;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ oinfo.oi_owner = free->xefi_owner;
+ if (free->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (free->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
error = xfs_trans_free_extent(tp, EFD_ITEM(done),
free->xefi_startblock,
free->xefi_blockcount,
- &free->xefi_oinfo, free->xefi_skip_discard);
- kmem_free(free);
+ &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD);
+ kmem_cache_free(xfs_extfree_item_cache, free);
return error;
}
@@ -502,7 +508,7 @@ xfs_extent_free_cancel_item(
struct xfs_extent_free_item *free;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
- kmem_free(free);
+ kmem_cache_free(xfs_extfree_item_cache, free);
}
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
@@ -525,6 +531,7 @@ xfs_agfl_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_mount *mp = tp->t_mountp;
struct xfs_efd_log_item *efdp = EFD_ITEM(done);
struct xfs_extent_free_item *free;
@@ -539,13 +546,13 @@ xfs_agfl_free_finish_item(
ASSERT(free->xefi_blockcount == 1);
agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+ oinfo.oi_owner = free->xefi_owner;
trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (!error)
- error = xfs_free_agfl_block(tp, agno, agbno, agbp,
- &free->xefi_oinfo);
+ error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -564,7 +571,7 @@ xfs_agfl_free_finish_item(
extp->ext_len = free->xefi_blockcount;
efdp->efd_next_extent++;
- kmem_free(free);
+ kmem_cache_free(xfs_extfree_item_cache, free);
return error;
}
@@ -597,7 +604,7 @@ xfs_efi_item_recover(
struct list_head *capture_list)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
struct xfs_extent *extp;
@@ -637,7 +644,7 @@ xfs_efi_item_recover(
}
- return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index cd2860c875bf..186d0f2137f1 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -9,7 +9,7 @@
/* kernel only EFI/EFD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -69,7 +69,7 @@ struct xfs_efd_log_item {
*/
#define XFS_EFD_MAX_FAST_EXTENTS 16
-extern struct kmem_zone *xfs_efi_zone;
-extern struct kmem_zone *xfs_efd_zone;
+extern struct kmem_cache *xfs_efi_cache;
+extern struct kmem_cache *xfs_efd_cache;
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7aa943edfc02..5bddb1e9e0b3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -66,40 +66,6 @@ xfs_is_falloc_aligned(
return !((pos | len) & mask);
}
-int
-xfs_update_prealloc_flags(
- struct xfs_inode *ip,
- enum xfs_prealloc_flags flags)
-{
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
- 0, 0, 0, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- if (!(flags & XFS_PREALLOC_INVISIBLE)) {
- VFS_I(ip)->i_mode &= ~S_ISUID;
- if (VFS_I(ip)->i_mode & S_IXGRP)
- VFS_I(ip)->i_mode &= ~S_ISGID;
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- }
-
- if (flags & XFS_PREALLOC_SET)
- ip->i_diflags |= XFS_DIFLAG_PREALLOC;
- if (flags & XFS_PREALLOC_CLEAR)
- ip->i_diflags &= ~XFS_DIFLAG_PREALLOC;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (flags & XFS_PREALLOC_SYNC)
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
-}
-
/*
* Fsync operations on directories are much simpler than on regular files,
* as there is no file data to flush, and thus also no need for explicit
@@ -259,7 +225,7 @@ xfs_file_dio_read(
ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
if (ret)
return ret;
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -437,8 +403,7 @@ restart:
}
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
- error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
- NULL, &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
return error;
} else
@@ -569,7 +534,7 @@ xfs_file_dio_write_aligned(
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0);
+ &xfs_dio_write_ops, 0, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
@@ -647,7 +612,7 @@ retry_exclusive:
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, flags);
+ &xfs_dio_write_ops, flags, 0);
/*
* Retry unaligned I/O with exclusive blocking semantics if the DIO
@@ -896,6 +861,21 @@ xfs_break_layouts(
return error;
}
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
+
+ if (xfs_has_wsync(ip->i_mount))
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
+}
+
#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
@@ -911,7 +891,6 @@ xfs_file_fallocate(
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
- enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
loff_t new_size = 0;
bool do_file_insert = false;
@@ -956,6 +935,10 @@ xfs_file_fallocate(
goto out_unlock;
}
+ error = file_modified(file);
+ if (error)
+ goto out_unlock;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
@@ -1005,8 +988,6 @@ xfs_file_fallocate(
}
do_file_insert = true;
} else {
- flags |= XFS_PREALLOC_SET;
-
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode)) {
new_size = offset + len;
@@ -1052,20 +1033,12 @@ xfs_file_fallocate(
}
if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len,
- XFS_BMAPI_PREALLOC);
+ error = xfs_alloc_file_space(ip, offset, len);
if (error)
goto out_unlock;
}
}
- if (file->f_flags & O_DSYNC)
- flags |= XFS_PREALLOC_SYNC;
-
- error = xfs_update_prealloc_flags(ip, flags);
- if (error)
- goto out_unlock;
-
/* Change file size if needed */
if (new_size) {
struct iattr iattr;
@@ -1084,8 +1057,14 @@ xfs_file_fallocate(
* leave shifted extents past EOF and hence losing access to
* the data that is contained within them.
*/
- if (do_file_insert)
+ if (do_file_insert) {
error = xfs_insert_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_file_sync_writes(file))
+ error = xfs_log_force_inode(ip);
out_unlock:
xfs_iunlock(ip, iolock);
@@ -1117,21 +1096,6 @@ xfs_file_fadvise(
return ret;
}
-/* Does this file, inode, or mount want synchronous writes? */
-static inline bool xfs_file_sync_writes(struct file *filp)
-{
- struct xfs_inode *ip = XFS_I(file_inode(filp));
-
- if (xfs_has_wsync(ip->i_mount))
- return true;
- if (filp->f_flags & (__O_SYNC | O_DSYNC))
- return true;
- if (IS_SYNC(file_inode(filp)))
- return true;
-
- return false;
-}
-
STATIC loff_t
xfs_file_remap_range(
struct file *file_in,
@@ -1452,7 +1416,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 48287caad28b..10e1cb71439e 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -864,8 +864,8 @@ xfs_getfsmap(
!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
return -EINVAL;
- use_rmap = capable(CAP_SYS_ADMIN) &&
- xfs_has_rmapbt(mp);
+ use_rmap = xfs_has_rmapbt(mp) &&
+ has_capability_noaudit(current, CAP_SYS_ADMIN);
head->fmh_entries = 0;
/* Set up our device handlers. */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 33e26690a8c4..68f74549fa22 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -17,6 +17,7 @@
#include "xfs_fsops.h"
#include "xfs_trans_space.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_trace.h"
@@ -347,7 +348,7 @@ xfs_fs_counts(
cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- mp->m_alloc_set_aside;
+ xfs_fdblocks_unavailable(mp);
spin_lock(&mp->m_sb_lock);
cnt->freertx = mp->m_sb.sb_frextents;
@@ -430,46 +431,36 @@ xfs_reserve_blocks(
* If the request is larger than the current reservation, reserve the
* blocks before we update the reserve counters. Sample m_fdblocks and
* perform a partial reservation if the request exceeds free space.
+ *
+ * The code below estimates how many blocks it can request from
+ * fdblocks to stash in the reserve pool. This is a classic TOCTOU
+ * race since fdblocks updates are not always coordinated via
+ * m_sb_lock. Set the reserve size even if there's not enough free
+ * space to fill it because mod_fdblocks will refill an undersized
+ * reserve when it can.
*/
- error = -ENOSPC;
- do {
- free = percpu_counter_sum(&mp->m_fdblocks) -
- mp->m_alloc_set_aside;
- if (free <= 0)
- break;
-
- delta = request - mp->m_resblks;
- lcounter = free - delta;
- if (lcounter < 0)
- /* We can't satisfy the request, just get what we can */
- fdblks_delta = free;
- else
- fdblks_delta = delta;
-
+ free = percpu_counter_sum(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp);
+ delta = request - mp->m_resblks;
+ mp->m_resblks = request;
+ if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
- * count or we'll get an ENOSPC. If we get a ENOSPC, it means
- * things changed while we were calculating fdblks_delta and so
- * we should try again to see if there is anything left to
- * reserve.
+ * count or we'll get an ENOSPC. Don't set the reserved flag
+ * here - we don't want to reserve the extra reserve blocks
+ * from the reserve.
*
- * Don't set the reserved flag here - we don't want to reserve
- * the extra reserve blocks from the reserve.....
+ * The desired reserve size can change after we drop the lock.
+ * Use mod_fdblocks to put the space into the reserve or into
+ * fdblocks as appropriate.
*/
+ fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ if (!error)
+ xfs_mod_fdblocks(mp, fdblks_delta, 0);
spin_lock(&mp->m_sb_lock);
- } while (error == -ENOSPC);
-
- /*
- * Update the reserve counters if blocks have been successfully
- * allocated.
- */
- if (!error && fdblks_delta) {
- mp->m_resblks += fdblks_delta;
- mp->m_resblks_avail += fdblks_delta;
}
-
out:
if (outval) {
outval->resblks = mp->m_resblks;
@@ -528,8 +519,11 @@ xfs_do_force_shutdown(
int tag;
const char *why;
- if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate))
+
+ if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) {
+ xlog_shutdown_wait(mp->m_log);
return;
+ }
if (mp->m_sb_bp)
mp->m_sb_bp->b_flags |= XBF_DONE;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f2210d927481..bffd6eb0b298 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -23,6 +23,7 @@
#include "xfs_reflink.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
+#include "xfs_log_priv.h"
#include <linux/iversion.h>
@@ -77,16 +78,17 @@ xfs_inode_alloc(
* XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
* and return NULL here on ENOMEM.
*/
- ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
+ ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
if (inode_init_always(mp->m_super, VFS_I(ip))) {
- kmem_cache_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_cache, ip);
return NULL;
}
/* VFS doesn't initialise i_mode or i_state! */
VFS_I(ip)->i_mode = 0;
VFS_I(ip)->i_state = 0;
+ mapping_set_large_folios(VFS_I(ip)->i_mapping);
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -130,11 +132,11 @@ xfs_inode_free_callback(
if (ip->i_afp) {
xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_afp);
}
if (ip->i_cowfp) {
xfs_idestroy_fork(ip->i_cowfp);
- kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
}
if (ip->i_itemp) {
ASSERT(!test_bit(XFS_LI_IN_AIL,
@@ -143,7 +145,7 @@ xfs_inode_free_callback(
ip->i_itemp = NULL;
}
- kmem_cache_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_cache, ip);
}
static void
@@ -289,22 +291,6 @@ xfs_perag_clear_inode_tag(
trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}
-static inline void
-xfs_inew_wait(
- struct xfs_inode *ip)
-{
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
-
- do {
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (!xfs_iflags_test(ip, XFS_INEW))
- break;
- schedule();
- } while (true);
- finish_wait(wq, &wait.wq_entry);
-}
-
/*
* When we recycle a reclaimable inode, we need to re-initialise the VFS inode
* part of the structure. This is made more complex by the fact we store
@@ -336,6 +322,7 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
+ mapping_set_large_folios(inode->i_mapping);
return error;
}
@@ -368,18 +355,13 @@ xfs_iget_recycle(
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
error = xfs_reinit_inode(mp, inode);
if (error) {
- bool wake;
-
/*
* Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list.
*/
rcu_read_lock();
spin_lock(&ip->i_flags_lock);
- wake = !!__xfs_iflags_test(ip, XFS_INEW);
ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
- if (wake)
- wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
@@ -770,7 +752,8 @@ again:
/*
* If we have a real type for an on-disk inode, we can setup the inode
- * now. If it's a new inode being created, xfs_ialloc will handle it.
+ * now. If it's a new inode being created, xfs_init_new_inode will
+ * handle it.
*/
if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
xfs_setup_existing_inode(ip);
@@ -891,9 +874,16 @@ xfs_reclaim_inode(
if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
goto out_iunlock;
- if (xfs_is_shutdown(ip->i_mount)) {
+ /*
+ * Check for log shutdown because aborting the inode can move the log
+ * tail and corrupt in memory state. This is fine if the log is shut
+ * down, but if the log is still active and only the mount is shut down
+ * then the in-memory log tail movement caused by the abort can be
+ * incorrectly propagated to disk.
+ */
+ if (xlog_is_shutdown(ip->i_mount->m_log)) {
xfs_iunpin_wait(ip);
- xfs_iflush_abort(ip);
+ xfs_iflush_shutdown_abort(ip);
goto reclaim;
}
if (xfs_ipincount(ip))
@@ -1872,28 +1862,20 @@ xfs_inodegc_worker(
}
/*
- * Force all currently queued inode inactivation work to run immediately, and
- * wait for the work to finish. Two pass - queue all the work first pass, wait
- * for it in a second pass.
+ * Force all currently queued inode inactivation work to run immediately and
+ * wait for the work to finish.
*/
void
xfs_inodegc_flush(
struct xfs_mount *mp)
{
- struct xfs_inodegc *gc;
- int cpu;
-
if (!xfs_is_inodegc_enabled(mp))
return;
trace_xfs_inodegc_flush(mp, __return_address);
xfs_inodegc_queue_all(mp);
-
- for_each_online_cpu(cpu) {
- gc = per_cpu_ptr(mp->m_inodegc, cpu);
- flush_work(&gc->work);
- }
+ flush_workqueue(mp->m_inodegc_wq);
}
/*
@@ -1904,18 +1886,12 @@ void
xfs_inodegc_stop(
struct xfs_mount *mp)
{
- struct xfs_inodegc *gc;
- int cpu;
-
if (!xfs_clear_inodegc_enabled(mp))
return;
xfs_inodegc_queue_all(mp);
+ drain_workqueue(mp->m_inodegc_wq);
- for_each_online_cpu(cpu) {
- gc = per_cpu_ptr(mp->m_inodegc, cpu);
- cancel_work_sync(&gc->work);
- }
trace_xfs_inodegc_stop(mp, __return_address);
}
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 017904a34c02..508e184e3b8f 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -20,7 +20,7 @@
#include "xfs_ialloc.h"
#include "xfs_trace.h"
-kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+struct kmem_cache *xfs_icreate_cache; /* inode create item */
static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
{
@@ -63,7 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
- kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
+ kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
static const struct xfs_item_ops xfs_icreate_item_ops = {
@@ -97,7 +97,7 @@ xfs_icreate_log(
{
struct xfs_icreate_item *icp;
- icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL);
+ icp = kmem_cache_zalloc(xfs_icreate_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
&xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
index a50d0b01e15a..64992823108a 100644
--- a/fs/xfs/xfs_icreate_item.h
+++ b/fs/xfs/xfs_icreate_item.h
@@ -12,7 +12,7 @@ struct xfs_icreate_item {
struct xfs_icreate_log ic_format;
};
-extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+extern struct kmem_cache *xfs_icreate_cache; /* inode create item */
void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_agblock_t agbno, unsigned int count,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a4f6f034fb81..39ae53efb3ab 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -35,8 +35,9 @@
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
+#include "xfs_log_priv.h"
-kmem_zone_t *xfs_inode_zone;
+struct kmem_cache *xfs_inode_cache;
/*
* Used in xfs_itruncate_extents(). This is the maximum number of extents
@@ -564,8 +565,6 @@ xfs_lock_two_inodes(
struct xfs_inode *ip1,
uint ip1_mode)
{
- struct xfs_inode *temp;
- uint mode_temp;
int attempts = 0;
struct xfs_log_item *lp;
@@ -578,12 +577,8 @@ xfs_lock_two_inodes(
ASSERT(ip0->i_ino != ip1->i_ino);
if (ip0->i_ino > ip1->i_ino) {
- temp = ip0;
- ip0 = ip1;
- ip1 = temp;
- mode_temp = ip0_mode;
- ip0_mode = ip1_mode;
- ip1_mode = mode_temp;
+ swap(ip0, ip1);
+ swap(ip0_mode, ip1_mode);
}
again:
@@ -664,9 +659,9 @@ xfs_ip2xflags(
*/
int
xfs_lookup(
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_inode_t **ipp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ struct xfs_inode **ipp,
struct xfs_name *ci_name)
{
xfs_ino_t inum;
@@ -994,8 +989,8 @@ xfs_create(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
- mapped_fsgid(mnt_userns), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+ mapped_fsgid(mnt_userns, &init_user_ns), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1148,8 +1143,8 @@ xfs_create_tmpfile(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
- mapped_fsgid(mnt_userns), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+ mapped_fsgid(mnt_userns, &init_user_ns), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1223,7 +1218,7 @@ xfs_link(
{
xfs_mount_t *mp = tdp->i_mount;
xfs_trans_t *tp;
- int error;
+ int error, nospace_error = 0;
int resblks;
trace_xfs_link(tdp, target_name);
@@ -1242,19 +1237,11 @@ xfs_link(
goto std_return;
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
- if (error == -ENOSPC) {
- resblks = 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
- }
+ error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
+ &tp, &nospace_error);
if (error)
goto std_return;
- xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
-
error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
@@ -1312,6 +1299,8 @@ xfs_link(
error_return:
xfs_trans_cancel(tp);
std_return:
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
return error;
}
@@ -2605,14 +2594,13 @@ xfs_ifree_cluster(
}
/*
- * This is called to return an inode to the inode free list.
- * The inode should already be truncated to 0 length and have
- * no pages associated with it. This routine also assumes that
- * the inode is already a part of the transaction.
+ * This is called to return an inode to the inode free list. The inode should
+ * already be truncated to 0 length and have no pages associated with it. This
+ * routine also assumes that the inode is already a part of the transaction.
*
- * The on-disk copy of the inode will have been added to the list
- * of unlinked inodes in the AGI. We need to remove the inode from
- * that list atomically with respect to freeing it here.
+ * The on-disk copy of the inode will have been added to the list of unlinked
+ * inodes in the AGI. We need to remove the inode from that list atomically with
+ * respect to freeing it here.
*/
int
xfs_ifree(
@@ -2634,13 +2622,16 @@ xfs_ifree(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
/*
- * Pull the on-disk inode from the AGI unlinked list.
+ * Free the inode first so that we guarantee that the AGI lock is going
+ * to be taken before we remove the inode from the unlinked list. This
+ * makes the AGI lock -> unlinked list modification order the same as
+ * used in O_TMPFILE creation.
*/
- error = xfs_iunlink_remove(tp, pag, ip);
+ error = xfs_difree(tp, pag, ip->i_ino, &xic);
if (error)
- goto out;
+ return error;
- error = xfs_difree(tp, pag, ip->i_ino, &xic);
+ error = xfs_iunlink_remove(tp, pag, ip);
if (error)
goto out;
@@ -2761,6 +2752,7 @@ xfs_remove(
xfs_mount_t *mp = dp->i_mount;
xfs_trans_t *tp = NULL;
int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
+ int dontcare;
int error = 0;
uint resblks;
@@ -2778,31 +2770,24 @@ xfs_remove(
goto std_return;
/*
- * We try to get the real space reservation first,
- * allowing for directory btree deletion(s) implying
- * possible bmap insert(s). If we can't get the space
- * reservation then we use 0 instead, and avoid the bmap
- * btree insert(s) in the directory code by, if the bmap
- * insert tries to happen, instead trimming the LAST
- * block from the directory.
+ * We try to get the real space reservation first, allowing for
+ * directory btree deletion(s) implying possible bmap insert(s). If we
+ * can't get the space reservation then we use 0 instead, and avoid the
+ * bmap btree insert(s) in the directory code by, if the bmap insert
+ * tries to happen, instead trimming the LAST block from the directory.
+ *
+ * Ignore EDQUOT and ENOSPC being returned via nospace_error because
+ * the directory code can handle a reservationless update and we don't
+ * want to prevent a user from trying to free space by deleting things.
*/
resblks = XFS_REMOVE_SPACE_RES(mp);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
- if (error == -ENOSPC) {
- resblks = 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
- &tp);
- }
+ error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
+ &tp, &dontcare);
if (error) {
ASSERT(error != -ENOSPC);
goto std_return;
}
- xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
/*
* If we're removing a directory perform some additional validation.
*/
@@ -3115,7 +3100,8 @@ xfs_rename(
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
int spaceres;
- int error;
+ bool retried = false;
+ int error, nospace_error = 0;
trace_xfs_rename(src_dp, target_dp, src_name, target_name);
@@ -3128,7 +3114,6 @@ xfs_rename(
* appropriately.
*/
if (flags & RENAME_WHITEOUT) {
- ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip);
if (error)
return error;
@@ -3140,9 +3125,12 @@ xfs_rename(
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
inodes, &num_inodes);
+retry:
+ nospace_error = 0;
spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
if (error == -ENOSPC) {
+ nospace_error = error;
spaceres = 0;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
&tp);
@@ -3197,6 +3185,31 @@ xfs_rename(
spaceres);
/*
+ * Try to reserve quota to handle an expansion of the target directory.
+ * We'll allow the rename to continue in reservationless mode if we hit
+ * a space usage constraint. If we trigger reservationless mode, save
+ * the errno if there isn't any free space in the target directory.
+ */
+ if (spaceres != 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
+ 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(target_dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ nospace_error = error;
+ spaceres = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
*
@@ -3442,6 +3455,8 @@ out_trans_cancel:
out_release_wip:
if (wip)
xfs_irele(wip);
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
return error;
}
@@ -3618,7 +3633,7 @@ xfs_iflush_cluster(
/*
* We must use the safe variant here as on shutdown xfs_iflush_abort()
- * can remove itself from the list.
+ * will remove itself from the list.
*/
list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
iip = (struct xfs_inode_log_item *)lip;
@@ -3666,7 +3681,7 @@ xfs_iflush_cluster(
* AIL, leaving a dirty/unpinned inode attached to the buffer
* that otherwise looks like it should be flushed.
*/
- if (xfs_is_shutdown(mp)) {
+ if (xlog_is_shutdown(mp->m_log)) {
xfs_iunpin_wait(ip);
xfs_iflush_abort(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -3692,9 +3707,19 @@ xfs_iflush_cluster(
}
if (error) {
+ /*
+ * Shutdown first so we kill the log before we release this
+ * buffer. If it is an INODE_ALLOC buffer and pins the tail
+ * of the log, failing it before the _log_ is shut down can
+ * result in the log tail being moved forward in the journal
+ * on disk because log writes can still be taking place. Hence
+ * unpinning the tail will allow the ICREATE intent to be
+ * removed from the log and recovery will fail with uninitialised
+ * inode cluster buffers.
+ */
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioend_fail(bp);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b21b177832d1..740ab13d1aa2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -231,8 +231,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
#define XFS_ISTALE (1 << 1) /* inode has been staled */
#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
-#define __XFS_INEW_BIT 3 /* inode has just been allocated */
-#define XFS_INEW (1 << __XFS_INEW_BIT)
+#define XFS_INEW (1 << 3) /* inode has just been allocated */
#define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
@@ -403,7 +402,7 @@ enum layout_break_reason {
int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct user_namespace *mnt_userns,
struct xfs_inode *dp, struct xfs_name *name,
@@ -463,15 +462,6 @@ xfs_itruncate_extents(
}
/* from xfs_file.c */
-enum xfs_prealloc_flags {
- XFS_PREALLOC_SET = (1 << 1),
- XFS_PREALLOC_CLEAR = (1 << 2),
- XFS_PREALLOC_SYNC = (1 << 3),
- XFS_PREALLOC_INVISIBLE = (1 << 4),
-};
-
-int xfs_update_prealloc_flags(struct xfs_inode *ip,
- enum xfs_prealloc_flags flags);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
@@ -492,7 +482,6 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
xfs_iflags_clear(ip, XFS_INEW);
barrier();
unlock_new_inode(VFS_I(ip));
- wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
}
static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
@@ -504,7 +493,7 @@ static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
void xfs_irele(struct xfs_inode *ip);
-extern struct kmem_zone *xfs_inode_zone;
+extern struct kmem_cache *xfs_inode_cache;
/* The default CoW extent size hint. */
#define XFS_DEFAULT_COWEXTSZ_HINT 32
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0659d19c211e..9e6ef55cf29e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,11 +17,12 @@
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_error.h"
#include <linux/iversion.h>
-kmem_zone_t *xfs_ili_zone; /* inode log item zone */
+struct kmem_cache *xfs_ili_cache; /* inode log item */
static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
{
@@ -543,10 +544,17 @@ xfs_inode_item_push(
uint rval = XFS_ITEM_SUCCESS;
int error;
- ASSERT(iip->ili_item.li_buf);
+ if (!bp || (ip->i_flags & XFS_ISTALE)) {
+ /*
+ * Inode item/buffer is being aborted due to cluster
+ * buffer deletion. Trigger a log force to have that operation
+ * completed and items removed from the AIL before the next push
+ * attempt.
+ */
+ return XFS_ITEM_PINNED;
+ }
- if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) ||
- (ip->i_flags & XFS_ISTALE))
+ if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp))
return XFS_ITEM_PINNED;
if (xfs_iflags_test(ip, XFS_IFLUSHING))
@@ -672,7 +680,7 @@ xfs_inode_item_init(
struct xfs_inode_log_item *iip;
ASSERT(ip->i_itemp == NULL);
- iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone,
+ iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_cache,
GFP_KERNEL | __GFP_NOFAIL);
iip->ili_inode = ip;
@@ -694,7 +702,7 @@ xfs_inode_item_destroy(
ip->i_itemp = NULL;
kmem_free(iip->ili_item.li_lv_shadow);
- kmem_cache_free(xfs_ili_zone, iip);
+ kmem_cache_free(xfs_ili_cache, iip);
}
@@ -720,6 +728,17 @@ xfs_iflush_ail_updates(
if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn)
continue;
+ /*
+ * dgc: Not sure how this happens, but it happens very
+ * occasionally via generic/388. xfs_iflush_abort() also
+ * silently handles this same "under writeback but not in AIL at
+ * shutdown" condition via xfs_trans_ail_delete().
+ */
+ if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
+ ASSERT(xlog_is_shutdown(lip->li_log));
+ continue;
+ }
+
lsn = xfs_ail_delete_one(ailp, lip);
if (!tail_lsn && lsn)
tail_lsn = lsn;
@@ -822,46 +841,143 @@ xfs_buf_inode_io_fail(
}
/*
- * This is the inode flushing abort routine. It is called when
- * the filesystem is shutting down to clean up the inode state. It is
- * responsible for removing the inode item from the AIL if it has not been
- * re-logged and clearing the inode's flush state.
+ * Clear the inode logging fields so no more flushes are attempted. If we are
+ * on a buffer list, it is now safe to remove it because the buffer is
+ * guaranteed to be locked. The caller will drop the reference to the buffer
+ * the log item held.
+ */
+static void
+xfs_iflush_abort_clean(
+ struct xfs_inode_log_item *iip)
+{
+ iip->ili_last_fields = 0;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ iip->ili_flush_lsn = 0;
+ iip->ili_item.li_buf = NULL;
+ list_del_init(&iip->ili_item.li_bio_list);
+}
+
+/*
+ * Abort flushing the inode from a context holding the cluster buffer locked.
+ *
+ * This is the normal runtime method of aborting writeback of an inode that is
+ * attached to a cluster buffer. It occurs when the inode and the backing
+ * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster
+ * flushing or buffer IO completion encounters a log shutdown situation.
+ *
+ * If we need to abort inode writeback and we don't already hold the buffer
+ * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be
+ * necessary in a shutdown situation.
*/
void
xfs_iflush_abort(
struct xfs_inode *ip)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
- struct xfs_buf *bp = NULL;
+ struct xfs_buf *bp;
- if (iip) {
- /*
- * Clear the failed bit before removing the item from the AIL so
- * xfs_trans_ail_delete() doesn't try to clear and release the
- * buffer attached to the log item before we are done with it.
- */
- clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
- xfs_trans_ail_delete(&iip->ili_item, 0);
+ if (!iip) {
+ /* clean inode, nothing to do */
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ return;
+ }
+
+ /*
+ * Remove the inode item from the AIL before we clear its internal
+ * state. Whilst the inode is in the AIL, it should have a valid buffer
+ * pointer for push operations to access - it is only safe to remove the
+ * inode from the buffer once it has been removed from the AIL.
+ *
+ * We also clear the failed bit before removing the item from the AIL
+ * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
+ * references the inode item owns and needs to hold until we've fully
+ * aborted the inode log item and detached it from the buffer.
+ */
+ clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
+ xfs_trans_ail_delete(&iip->ili_item, 0);
+
+ /*
+ * Grab the inode buffer so we can release the reference the inode log
+ * item holds on it.
+ */
+ spin_lock(&iip->ili_lock);
+ bp = iip->ili_item.li_buf;
+ xfs_iflush_abort_clean(iip);
+ spin_unlock(&iip->ili_lock);
+
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ if (bp)
+ xfs_buf_rele(bp);
+}
+/*
+ * Abort an inode flush in the case of a shutdown filesystem. This can be called
+ * from anywhere with just an inode reference and does not require holding the
+ * inode cluster buffer locked. If the inode is attached to a cluster buffer,
+ * it will grab and lock it safely, then abort the inode flush.
+ */
+void
+xfs_iflush_shutdown_abort(
+ struct xfs_inode *ip)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_buf *bp;
+
+ if (!iip) {
+ /* clean inode, nothing to do */
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ return;
+ }
+
+ spin_lock(&iip->ili_lock);
+ bp = iip->ili_item.li_buf;
+ if (!bp) {
+ spin_unlock(&iip->ili_lock);
+ xfs_iflush_abort(ip);
+ return;
+ }
+
+ /*
+ * We have to take a reference to the buffer so that it doesn't get
+ * freed when we drop the ili_lock and then wait to lock the buffer.
+ * We'll clean up the extra reference after we pick up the ili_lock
+ * again.
+ */
+ xfs_buf_hold(bp);
+ spin_unlock(&iip->ili_lock);
+ xfs_buf_lock(bp);
+
+ spin_lock(&iip->ili_lock);
+ if (!iip->ili_item.li_buf) {
/*
- * Clear the inode logging fields so no more flushes are
- * attempted.
+ * Raced with another removal, hold the only reference
+ * to bp now. Inode should not be in the AIL now, so just clean
+ * up and return.
*/
- spin_lock(&iip->ili_lock);
- iip->ili_last_fields = 0;
- iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
- iip->ili_flush_lsn = 0;
- bp = iip->ili_item.li_buf;
- iip->ili_item.li_buf = NULL;
- list_del_init(&iip->ili_item.li_bio_list);
+ ASSERT(list_empty(&iip->ili_item.li_bio_list));
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags));
+ xfs_iflush_abort_clean(iip);
spin_unlock(&iip->ili_lock);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_buf_relse(bp);
+ return;
}
- xfs_iflags_clear(ip, XFS_IFLUSHING);
- if (bp)
- xfs_buf_rele(bp);
+
+ /*
+ * Got two references to bp. The first will get dropped by
+ * xfs_iflush_abort() when the item is removed from the buffer list, but
+ * we can't drop our reference until _abort() returns because we have to
+ * unlock the buffer as well. Hence we abort and then unlock and release
+ * our reference to the buffer.
+ */
+ ASSERT(iip->ili_item.li_buf == bp);
+ spin_unlock(&iip->ili_lock);
+ xfs_iflush_abort(ip);
+ xfs_buf_relse(bp);
}
+
/*
* convert an xfs_inode_log_format struct from the old 32 bit version
* (which can have different field alignments) to the native 64 bit version
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 403b45ab9aa2..bbd836a44ff0 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -44,9 +44,10 @@ static inline int xfs_inode_clean(struct xfs_inode *ip)
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
extern void xfs_iflush_abort(struct xfs_inode *);
+extern void xfs_iflush_shutdown_abort(struct xfs_inode *);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
struct xfs_inode_log_format *);
-extern struct kmem_zone *xfs_ili_zone;
+extern struct kmem_cache *xfs_ili_cache;
#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0c795dc093ef..83481005317a 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -372,7 +372,7 @@ int
xfs_ioc_attr_list(
struct xfs_inode *dp,
void __user *ubuf,
- int bufsize,
+ size_t bufsize,
int flags,
struct xfs_attrlist_cursor __user *ucursor)
{
@@ -627,86 +627,6 @@ xfs_attrmulti_by_handle(
return error;
}
-int
-xfs_ioc_space(
- struct file *filp,
- xfs_flock64_t *bf)
-{
- struct inode *inode = file_inode(filp);
- struct xfs_inode *ip = XFS_I(inode);
- struct iattr iattr;
- enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR;
- uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- int error;
-
- if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
- return -EPERM;
-
- if (!(filp->f_mode & FMODE_WRITE))
- return -EBADF;
-
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
- if (xfs_is_always_cow_inode(ip))
- return -EOPNOTSUPP;
-
- if (filp->f_flags & O_DSYNC)
- flags |= XFS_PREALLOC_SYNC;
- if (filp->f_mode & FMODE_NOCMTIME)
- flags |= XFS_PREALLOC_INVISIBLE;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
- if (error)
- goto out_unlock;
- inode_dio_wait(inode);
-
- switch (bf->l_whence) {
- case 0: /*SEEK_SET*/
- break;
- case 1: /*SEEK_CUR*/
- bf->l_start += filp->f_pos;
- break;
- case 2: /*SEEK_END*/
- bf->l_start += XFS_ISIZE(ip);
- break;
- default:
- error = -EINVAL;
- goto out_unlock;
- }
-
- if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- if (bf->l_start > XFS_ISIZE(ip)) {
- error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
- bf->l_start - XFS_ISIZE(ip), 0);
- if (error)
- goto out_unlock;
- }
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = bf->l_start;
- error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp),
- &iattr);
- if (error)
- goto out_unlock;
-
- error = xfs_update_prealloc_flags(ip, flags);
-
-out_unlock:
- xfs_iunlock(ip, iolock);
- mnt_drop_write_file(filp);
- return error;
-}
-
/* Return 0 on success or positive error */
int
xfs_fsbulkstat_one_fmt(
@@ -1269,7 +1189,7 @@ xfs_ioctl_setattr_get_trans(
goto out_error;
error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
- capable(CAP_FOWNER), &tp);
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
if (error)
goto out_error;
@@ -1544,10 +1464,10 @@ xfs_ioc_getbmap(
if (bmx.bmv_count < 2)
return -EINVAL;
- if (bmx.bmv_count > ULONG_MAX / recsize)
+ if (bmx.bmv_count >= INT_MAX / recsize)
return -ENOMEM;
- buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL);
+ buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -1601,11 +1521,11 @@ xfs_ioc_getfsmap(
*/
count = min_t(unsigned int, head.fmh_count,
131072 / sizeof(struct fsmap));
- recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
if (!recs) {
count = min_t(unsigned int, head.fmh_count,
PAGE_SIZE / sizeof(struct fsmap));
- recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
if (!recs)
return -ENOMEM;
}
@@ -1935,6 +1855,15 @@ xfs_fs_eofblocks_from_user(
}
/*
+ * These long-unused ioctls were removed from the official ioctl API in 5.17,
+ * but retain these definitions so that we can log warnings about them.
+ */
+#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
+#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
+#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
+#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
+
+/*
* Note: some of the ioctl's return positive numbers as a
* byte count indicating success, such as readlink_by_handle.
* So we don't "sign flip" like most other routines. This means
@@ -1964,13 +1893,11 @@ xfs_file_ioctl(
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP64: {
- xfs_flock64_t bf;
-
- if (copy_from_user(&bf, arg, sizeof(bf)))
- return -EFAULT;
- return xfs_ioc_space(filp, &bf);
- }
+ case XFS_IOC_FREESP64:
+ xfs_warn_once(mp,
+ "%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported",
+ current->comm);
+ return -ENOTTY;
case XFS_IOC_DIOINFO: {
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
struct dioattr da;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 28453a6d4461..d4abba2c13c1 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -10,12 +10,6 @@ struct xfs_bstat;
struct xfs_ibulk;
struct xfs_inogrp;
-
-extern int
-xfs_ioc_space(
- struct file *filp,
- xfs_flock64_t *bf);
-
int
xfs_ioc_swapext(
xfs_swapext_t *sxp);
@@ -38,8 +32,9 @@ xfs_readlink_by_handle(
int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
uint32_t opcode, void __user *uname, void __user *value,
uint32_t *len, uint32_t flags);
-int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize,
- int flags, struct xfs_attrlist_cursor __user *ucursor);
+int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf,
+ size_t bufsize, int flags,
+ struct xfs_attrlist_cursor __user *ucursor);
extern struct dentry *
xfs_handle_to_dentry(
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 8783af203cfc..ca25ed89b706 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -28,22 +28,6 @@
#ifdef BROKEN_X86_ALIGNMENT
STATIC int
-xfs_compat_flock64_copyin(
- xfs_flock64_t *bf,
- compat_xfs_flock64_t __user *arg32)
-{
- if (get_user(bf->l_type, &arg32->l_type) ||
- get_user(bf->l_whence, &arg32->l_whence) ||
- get_user(bf->l_start, &arg32->l_start) ||
- get_user(bf->l_len, &arg32->l_len) ||
- get_user(bf->l_sysid, &arg32->l_sysid) ||
- get_user(bf->l_pid, &arg32->l_pid) ||
- copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
- return -EFAULT;
- return 0;
-}
-
-STATIC int
xfs_compat_ioc_fsgeometry_v1(
struct xfs_mount *mp,
compat_xfs_fsop_geom_v1_t __user *arg32)
@@ -233,7 +217,7 @@ xfs_compat_ioc_fsbulkstat(
inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat;
bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat;
-#ifdef CONFIG_X86_X32
+#ifdef CONFIG_X86_X32_ABI
if (in_x32_syscall()) {
/*
* ... but on x32 the input xfs_fsop_bulkreq has pointers
@@ -445,17 +429,6 @@ xfs_file_compat_ioctl(
switch (cmd) {
#if defined(BROKEN_X86_ALIGNMENT)
- case XFS_IOC_ALLOCSP_32:
- case XFS_IOC_FREESP_32:
- case XFS_IOC_ALLOCSP64_32:
- case XFS_IOC_FREESP64_32: {
- struct xfs_flock64 bf;
-
- if (xfs_compat_flock64_copyin(&bf, arg))
- return -EFAULT;
- cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
- return xfs_ioc_space(filp, &bf);
- }
case XFS_IOC_FSGEOMETRY_V1_32:
return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg);
case XFS_IOC_FSGROWFSDATA_32: {
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 9929482bf358..c14852362fce 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -142,28 +142,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq {
_IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
#ifdef BROKEN_X86_ALIGNMENT
-/* on ia32 l_start is on a 32-bit boundary */
-typedef struct compat_xfs_flock64 {
- __s16 l_type;
- __s16 l_whence;
- __s64 l_start __attribute__((packed));
- /* len == 0 means until end of file */
- __s64 l_len __attribute__((packed));
- __s32 l_sysid;
- __u32 l_pid;
- __s32 l_pad[4]; /* reserve area */
-} compat_xfs_flock64_t;
-
-#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64)
-#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64)
-#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64)
-#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64)
-#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64)
-#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
-#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
-#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
-#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64)
-
typedef struct compat_xfs_fsop_geom_v1 {
__u32 blocksize; /* filesystem (data) block size */
__u32 rtextsize; /* realtime extent size */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 093758440ad5..e552ce541ec2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -28,7 +28,6 @@
#include "xfs_dquot.h"
#include "xfs_reflink.h"
-
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -54,7 +53,8 @@ xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
- u16 flags)
+ unsigned int mapping_flags,
+ u16 iomap_flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
@@ -71,16 +71,22 @@ xfs_bmbt_to_iomap(
iomap->type = IOMAP_DELALLOC;
} else {
iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
+ if (mapping_flags & IOMAP_DAX)
+ iomap->addr += target->bt_dax_part_off;
+
if (imap->br_state == XFS_EXT_UNWRITTEN)
iomap->type = IOMAP_UNWRITTEN;
else
iomap->type = IOMAP_MAPPED;
+
}
iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
- iomap->bdev = target->bt_bdev;
- iomap->dax_dev = target->bt_daxdev;
- iomap->flags = flags;
+ if (mapping_flags & IOMAP_DAX)
+ iomap->dax_dev = target->bt_daxdev;
+ else
+ iomap->bdev = target->bt_bdev;
+ iomap->flags = iomap_flags;
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
@@ -188,6 +194,7 @@ xfs_iomap_write_direct(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb,
+ unsigned int flags,
struct xfs_bmbt_irec *imap)
{
struct xfs_mount *mp = ip->i_mount;
@@ -229,7 +236,7 @@ xfs_iomap_write_direct(
* the reserve block pool for bmbt block allocation if there is no space
* left but we need to do unwritten extent conversion.
*/
- if (IS_DAX(VFS_I(ip))) {
+ if (flags & IOMAP_DAX) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
if (imap->br_state == XFS_EXT_UNWRITTEN) {
force = true;
@@ -620,7 +627,7 @@ imap_needs_alloc(
imap->br_startblock == DELAYSTARTBLOCK)
return true;
/* we convert unwritten extents before copying the data for DAX */
- if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN)
+ if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
return true;
return false;
}
@@ -800,7 +807,7 @@ xfs_direct_write_iomap_begin(
xfs_iunlock(ip, lockmode);
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
allocate_blocks:
error = -EAGAIN;
@@ -826,23 +833,24 @@ allocate_blocks:
xfs_iunlock(ip, lockmode);
error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
- &imap);
+ flags, &imap);
if (error)
return error;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+ iomap_flags | IOMAP_F_NEW);
out_found_cow:
xfs_iunlock(ip, lockmode);
length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
if (imap.br_startblock != HOLESTARTBLOCK) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
if (error)
return error;
}
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
out_unlock:
if (lockmode)
@@ -1052,23 +1060,24 @@ retry:
*/
xfs_iunlock(ip, XFS_ILOCK_EXCL);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
found_imap:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
found_cow:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (imap.br_startoff <= offset_fsb) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
if (error)
return error;
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+ IOMAP_F_SHARED);
}
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1177,7 +1186,8 @@ xfs_read_iomap_begin(
if (error)
return error;
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+ shared ? IOMAP_F_SHARED : 0);
}
const struct iomap_ops xfs_read_iomap_ops = {
@@ -1236,7 +1246,8 @@ xfs_seek_iomap_begin(
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+ IOMAP_F_SHARED);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
@@ -1258,7 +1269,7 @@ xfs_seek_iomap_begin(
imap.br_state = XFS_EXT_NORM;
done:
xfs_trim_extent(&imap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
@@ -1305,9 +1316,40 @@ out_unlock:
if (error)
return error;
ASSERT(nimaps);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
.iomap_begin = xfs_xattr_iomap_begin,
};
+
+int
+xfs_zero_range(
+ struct xfs_inode *ip,
+ loff_t pos,
+ loff_t len,
+ bool *did_zero)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (IS_DAX(inode))
+ return dax_zero_range(inode, pos, len, did_zero,
+ &xfs_direct_write_iomap_ops);
+ return iomap_zero_range(inode, pos, len, did_zero,
+ &xfs_buffered_write_iomap_ops);
+}
+
+int
+xfs_truncate_page(
+ struct xfs_inode *ip,
+ loff_t pos,
+ bool *did_zero)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (IS_DAX(inode))
+ return dax_truncate_page(inode, pos, did_zero,
+ &xfs_direct_write_iomap_ops);
+ return iomap_truncate_page(inode, pos, did_zero,
+ &xfs_buffered_write_iomap_ops);
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7d3703556d0e..e88dc162c785 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -12,13 +12,19 @@ struct xfs_inode;
struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
- xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
+ xfs_fileoff_t count_fsb, unsigned int flags,
+ struct xfs_bmbt_irec *imap);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);
-int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *, u16);
+int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
+ struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
+ u16 iomap_flags);
+
+int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
+ bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
static inline xfs_filblks_t
xfs_aligned_fsb_count(
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a607d6aca5c4..b34e8e4344a8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -511,27 +511,6 @@ xfs_vn_get_link(
return ERR_PTR(error);
}
-STATIC const char *
-xfs_vn_get_link_inline(
- struct dentry *dentry,
- struct inode *inode,
- struct delayed_call *done)
-{
- struct xfs_inode *ip = XFS_I(inode);
- char *link;
-
- ASSERT(ip->i_df.if_format == XFS_DINODE_FMT_LOCAL);
-
- /*
- * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if
- * if_data is junk.
- */
- link = ip->i_df.if_u1.if_data;
- if (XFS_IS_CORRUPT(ip->i_mount, !link))
- return ERR_PTR(-EFSCORRUPTED);
- return link;
-}
-
static uint32_t
xfs_stat_blksize(
struct xfs_inode *ip)
@@ -634,37 +613,6 @@ xfs_vn_getattr(
return 0;
}
-static void
-xfs_setattr_mode(
- struct xfs_inode *ip,
- struct iattr *iattr)
-{
- struct inode *inode = VFS_I(ip);
- umode_t mode = iattr->ia_mode;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- inode->i_mode &= S_IFMT;
- inode->i_mode |= mode & ~S_IFMT;
-}
-
-void
-xfs_setattr_time(
- struct xfs_inode *ip,
- struct iattr *iattr)
-{
- struct inode *inode = VFS_I(ip);
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- if (iattr->ia_valid & ATTR_ATIME)
- inode->i_atime = iattr->ia_atime;
- if (iattr->ia_valid & ATTR_CTIME)
- inode->i_ctime = iattr->ia_ctime;
- if (iattr->ia_valid & ATTR_MTIME)
- inode->i_mtime = iattr->ia_mtime;
-}
-
static int
xfs_vn_change_ok(
struct user_namespace *mnt_userns,
@@ -699,10 +647,10 @@ xfs_setattr_nonsize(
int mask = iattr->ia_valid;
xfs_trans_t *tp;
int error;
- kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
- kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
+ kuid_t uid = GLOBAL_ROOT_UID;
+ kgid_t gid = GLOBAL_ROOT_GID;
struct xfs_dquot *udqp = NULL, *gdqp = NULL;
- struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL;
+ struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL;
ASSERT((mask & ATTR_SIZE) == 0);
@@ -744,66 +692,30 @@ xfs_setattr_nonsize(
}
error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
- capable(CAP_FOWNER), &tp);
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
if (error)
goto out_dqrele;
/*
- * Change file ownership. Must be the owner or privileged.
+ * Register quota modifications in the transaction. Must be the owner
+ * or privileged. These IDs could have changed since we last looked at
+ * them. But, we're assured that if the ownership did change while we
+ * didn't have the inode locked, inode's dquot(s) would have changed
+ * also.
*/
- if (mask & (ATTR_UID|ATTR_GID)) {
- /*
- * These IDs could have changed since we last looked at them.
- * But, we're assured that if the ownership did change
- * while we didn't have the inode locked, inode's dquot(s)
- * would have changed also.
- */
- iuid = inode->i_uid;
- igid = inode->i_gid;
- gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
- uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
-
- /*
- * CAP_FSETID overrides the following restrictions:
- *
- * The set-user-ID and set-group-ID bits of a file will be
- * cleared upon successful return from chown()
- */
- if ((inode->i_mode & (S_ISUID|S_ISGID)) &&
- !capable(CAP_FSETID))
- inode->i_mode &= ~(S_ISUID|S_ISGID);
-
- /*
- * Change the ownerships and register quota modifications
- * in the transaction.
- */
- if (!uid_eq(iuid, uid)) {
- if (XFS_IS_UQUOTA_ON(mp)) {
- ASSERT(mask & ATTR_UID);
- ASSERT(udqp);
- olddquot1 = xfs_qm_vop_chown(tp, ip,
- &ip->i_udquot, udqp);
- }
- inode->i_uid = uid;
- }
- if (!gid_eq(igid, gid)) {
- if (XFS_IS_GQUOTA_ON(mp)) {
- ASSERT(xfs_has_pquotino(mp) ||
- !XFS_IS_PQUOTA_ON(mp));
- ASSERT(mask & ATTR_GID);
- ASSERT(gdqp);
- olddquot2 = xfs_qm_vop_chown(tp, ip,
- &ip->i_gdquot, gdqp);
- }
- inode->i_gid = gid;
- }
+ if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) &&
+ !uid_eq(inode->i_uid, iattr->ia_uid)) {
+ ASSERT(udqp);
+ old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
+ }
+ if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) &&
+ !gid_eq(inode->i_gid, iattr->ia_gid)) {
+ ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
+ ASSERT(gdqp);
+ old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
}
- if (mask & ATTR_MODE)
- xfs_setattr_mode(ip, iattr);
- if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
- xfs_setattr_time(ip, iattr);
-
+ setattr_copy(mnt_userns, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
@@ -815,8 +727,8 @@ xfs_setattr_nonsize(
/*
* Release any dquot(s) the inode had kept before chown.
*/
- xfs_qm_dqrele(olddquot1);
- xfs_qm_dqrele(olddquot2);
+ xfs_qm_dqrele(old_udqp);
+ xfs_qm_dqrele(old_gdqp);
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -911,8 +823,8 @@ xfs_setattr_size(
*/
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
- error = iomap_zero_range(inode, oldsize, newsize - oldsize,
- &did_zeroing, &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, oldsize, newsize - oldsize,
+ &did_zeroing);
} else {
/*
* iomap won't detect a dirty page over an unwritten block (or a
@@ -924,8 +836,7 @@ xfs_setattr_size(
newsize);
if (error)
return error;
- error = iomap_truncate_page(inode, newsize, &did_zeroing,
- &xfs_buffered_write_iomap_ops);
+ error = xfs_truncate_page(ip, newsize, &did_zeroing);
}
if (error)
@@ -1028,11 +939,8 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
- if (iattr->ia_valid & ATTR_MODE)
- xfs_setattr_mode(ip, iattr);
- if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
- xfs_setattr_time(ip, iattr);
-
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(mnt_userns, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
@@ -1250,14 +1158,6 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.update_time = xfs_vn_update_time,
};
-static const struct inode_operations xfs_inline_symlink_inode_operations = {
- .get_link = xfs_vn_get_link_inline,
- .getattr = xfs_vn_getattr,
- .setattr = xfs_vn_setattr,
- .listxattr = xfs_vn_listxattr,
- .update_time = xfs_vn_update_time,
-};
-
/* Figure out if this file actually supports DAX. */
static bool
xfs_inode_supports_dax(
@@ -1332,9 +1232,9 @@ xfs_diflags_to_iflags(
* Initialize the Linux inode.
*
* When reading existing inodes from disk this is called directly from xfs_iget,
- * when creating a new inode it is called from xfs_ialloc after setting up the
- * inode. These callers have different criteria for clearing XFS_INEW, so leave
- * it up to the caller to deal with unlocking the inode appropriately.
+ * when creating a new inode it is called from xfs_init_new_inode after setting
+ * up the inode. These callers have different criteria for clearing XFS_INEW, so
+ * leave it up to the caller to deal with unlocking the inode appropriately.
*/
void
xfs_setup_inode(
@@ -1408,10 +1308,7 @@ xfs_setup_iops(
inode->i_fop = &xfs_dir_file_operations;
break;
case S_IFLNK:
- if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL)
- inode->i_op = &xfs_inline_symlink_inode_operations;
- else
- inode->i_op = &xfs_symlink_inode_operations;
+ inode->i_op = &xfs_symlink_inode_operations;
break;
default:
inode->i_op = &xfs_inode_operations;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index c174262a074e..cb9105d667db 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -61,6 +61,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/ratelimit.h>
#include <linux/rhashtable.h>
#include <linux/xattr.h>
+#include <linux/mnt_idmapping.h>
#include <asm/page.h>
#include <asm/div64.h>
@@ -196,8 +197,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
char *data, unsigned int op);
-void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev,
- struct completion *done);
#define ASSERT_ALWAYS(expr) \
(likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f6cd2d4aa770..499e15b24215 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -21,7 +21,7 @@
#include "xfs_sb.h"
#include "xfs_health.h"
-kmem_zone_t *xfs_log_ticket_zone;
+struct kmem_cache *xfs_log_ticket_cache;
/* Local miscellaneous function prototypes */
STATIC struct xlog *
@@ -487,7 +487,10 @@ out_error:
* Run all the pending iclog callbacks and wake log force waiters and iclog
* space waiters so they can process the newly set shutdown state. We really
* don't care what order we process callbacks here because the log is shut down
- * and so state cannot change on disk anymore.
+ * and so state cannot change on disk anymore. However, we cannot wake waiters
+ * until the callbacks have been processed because we may be in unmount and
+ * we must ensure that all AIL operations the callbacks perform have completed
+ * before we tear down the AIL.
*
* We avoid processing actively referenced iclogs so that we don't run callbacks
* while the iclog owner might still be preparing the iclog for IO submssion.
@@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks(
struct xlog_in_core *iclog;
LIST_HEAD(cb_list);
- spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
do {
if (atomic_read(&iclog->ic_refcnt)) {
@@ -509,26 +511,22 @@ xlog_state_shutdown_callbacks(
continue;
}
list_splice_init(&iclog->ic_callbacks, &cb_list);
+ spin_unlock(&log->l_icloglock);
+
+ xlog_cil_process_committed(&cb_list);
+
+ spin_lock(&log->l_icloglock);
wake_up_all(&iclog->ic_write_wait);
wake_up_all(&iclog->ic_force_wait);
} while ((iclog = iclog->ic_next) != log->l_iclog);
wake_up_all(&log->l_flush_wait);
- spin_unlock(&log->l_icloglock);
-
- xlog_cil_process_committed(&cb_list);
}
/*
* Flush iclog to disk if this is the last reference to the given iclog and the
* it is in the WANT_SYNC state.
*
- * If the caller passes in a non-zero @old_tail_lsn and the current log tail
- * does not match, there may be metadata on disk that must be persisted before
- * this iclog is written. To satisfy that requirement, set the
- * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
- * log tail value.
- *
* If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
* log tail is updated correctly. NEED_FUA indicates that the iclog will be
* written to stable storage, and implies that a commit record is contained
@@ -545,12 +543,10 @@ xlog_state_shutdown_callbacks(
* always capture the tail lsn on the iclog on the first NEED_FUA release
* regardless of the number of active reference counts on this iclog.
*/
-
int
xlog_state_release_iclog(
struct xlog *log,
- struct xlog_in_core *iclog,
- xfs_lsn_t old_tail_lsn)
+ struct xlog_in_core *iclog)
{
xfs_lsn_t tail_lsn;
bool last_ref;
@@ -561,18 +557,14 @@ xlog_state_release_iclog(
/*
* Grabbing the current log tail needs to be atomic w.r.t. the writing
* of the tail LSN into the iclog so we guarantee that the log tail does
- * not move between deciding if a cache flush is required and writing
- * the LSN into the iclog below.
+ * not move between the first time we know that the iclog needs to be
+ * made stable and when we eventually submit it.
*/
- if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ (iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
+ !iclog->ic_header.h_tail_lsn) {
tail_lsn = xlog_assign_tail_lsn(log->l_mp);
-
- if (old_tail_lsn && tail_lsn != old_tail_lsn)
- iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
-
- if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
- !iclog->ic_header.h_tail_lsn)
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
}
last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
@@ -583,11 +575,8 @@ xlog_state_release_iclog(
* pending iclog callbacks that were waiting on the release of
* this iclog.
*/
- if (last_ref) {
- spin_unlock(&log->l_icloglock);
+ if (last_ref)
xlog_state_shutdown_callbacks(log);
- spin_lock(&log->l_icloglock);
- }
return -EIO;
}
@@ -600,8 +589,6 @@ xlog_state_release_iclog(
}
iclog->ic_state = XLOG_STATE_SYNCING;
- if (!iclog->ic_header.h_tail_lsn)
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
xlog_verify_tail_lsn(log, iclog);
trace_xlog_iclog_syncing(iclog, _RET_IP_);
@@ -812,10 +799,9 @@ xfs_log_mount_finish(
* mount failure occurs.
*/
mp->m_super->s_flags |= SB_ACTIVE;
+ xfs_log_work_queue(mp);
if (xlog_recovery_needed(log))
error = xlog_recover_finish(log);
- if (!error)
- xfs_log_work_queue(mp);
mp->m_super->s_flags &= ~SB_ACTIVE;
evict_inodes(mp->m_super);
@@ -874,7 +860,7 @@ xlog_force_iclog(
iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
if (iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
- return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
+ return xlog_state_release_iclog(iclog->ic_log, iclog);
}
/*
@@ -1102,7 +1088,7 @@ xfs_log_item_init(
int type,
const struct xfs_item_ops *ops)
{
- item->li_mountp = mp;
+ item->li_log = mp->m_log;
item->li_ailp = mp->m_ail;
item->li_type = type;
item->li_ops = ops;
@@ -1374,7 +1360,7 @@ xlog_ioend_work(
*/
if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
xfs_alert(log->l_mp, "log I/O error %d", error);
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
xlog_state_done_syncing(iclog);
@@ -1883,19 +1869,19 @@ xlog_write_iclog(
return;
}
- bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
- bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
- iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
- iclog->ic_bio.bi_end_io = xlog_bio_end_io;
- iclog->ic_bio.bi_private = iclog;
-
/*
* We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more
* IOs coming immediately after this one. This prevents the block layer
* writeback throttle from throttling log writes behind background
* metadata writeback and causing priority inversions.
*/
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
+ bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec,
+ howmany(count, PAGE_SIZE),
+ REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE);
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+ iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+ iclog->ic_bio.bi_private = iclog;
+
if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
/*
@@ -1913,7 +1899,7 @@ xlog_write_iclog(
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return;
}
if (is_vmalloc_addr(iclog->ic_data))
@@ -2412,7 +2398,7 @@ xlog_write_copy_finish(
ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
xlog_is_shutdown(log));
release_iclog:
- error = xlog_state_release_iclog(log, iclog, 0);
+ error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
return error;
}
@@ -2488,7 +2474,7 @@ xlog_write(
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
xlog_print_tic_res(log->l_mp, ticket);
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
len = xlog_write_calc_vec_length(ticket, log_vector, optype);
@@ -2629,7 +2615,7 @@ next_lv:
spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- error = xlog_state_release_iclog(log, iclog, 0);
+ error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
return error;
@@ -3053,7 +3039,7 @@ restart:
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
- error = xlog_state_release_iclog(log, iclog, 0);
+ error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
if (error)
return error;
@@ -3487,7 +3473,7 @@ xfs_log_ticket_put(
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
- kmem_cache_free(xfs_log_ticket_zone, ticket);
+ kmem_cache_free(xfs_log_ticket_cache, ticket);
}
xlog_ticket_t *
@@ -3611,7 +3597,7 @@ xlog_ticket_alloc(
struct xlog_ticket *tic;
int unit_res;
- tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
+ tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
unit_res = xlog_calc_unit_res(log, unit_bytes);
@@ -3822,9 +3808,10 @@ xlog_verify_iclog(
#endif
/*
- * Perform a forced shutdown on the log. This should be called once and once
- * only by the high level filesystem shutdown code to shut the log subsystem
- * down cleanly.
+ * Perform a forced shutdown on the log.
+ *
+ * This can be called from low level log code to trigger a shutdown, or from the
+ * high level mount shutdown code when the mount shuts down.
*
* Our main objectives here are to make sure that:
* a. if the shutdown was not due to a log IO error, flush the logs to
@@ -3833,6 +3820,8 @@ xlog_verify_iclog(
* parties to find out. Nothing new gets queued after this is done.
* c. Tasks sleeping on log reservations, pinned objects and
* other resources get woken up.
+ * d. The mount is also marked as shut down so that log triggered shutdowns
+ * still behave the same as if they called xfs_force_shutdown().
*
* Return true if the shutdown cause was a log IO error and we actually shut the
* log down.
@@ -3844,25 +3833,25 @@ xlog_force_shutdown(
{
bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
- /*
- * If this happens during log recovery then we aren't using the runtime
- * log mechanisms yet so there's nothing to shut down.
- */
- if (!log || xlog_in_recovery(log))
+ if (!log)
return false;
- ASSERT(!xlog_is_shutdown(log));
-
/*
* Flush all the completed transactions to disk before marking the log
* being shut down. We need to do this first as shutting down the log
* before the force will prevent the log force from flushing the iclogs
* to disk.
*
- * Re-entry due to a log IO error shutdown during the log force is
- * prevented by the atomicity of higher level shutdown code.
+ * When we are in recovery, there are no transactions to flush, and
+ * we don't want to touch the log because we don't want to perturb the
+ * current head/tail for future recovery attempts. Hence we need to
+ * avoid a log force in this case.
+ *
+ * If we are shutting down due to a log IO error, then we must avoid
+ * trying to write the log as that may just result in more IO errors and
+ * an endless shutdown/force loop.
*/
- if (!log_error)
+ if (!log_error && !xlog_in_recovery(log))
xfs_log_force(log->l_mp, XFS_LOG_SYNC);
/*
@@ -3879,12 +3868,25 @@ xlog_force_shutdown(
spin_lock(&log->l_icloglock);
if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) {
spin_unlock(&log->l_icloglock);
- ASSERT(0);
return false;
}
spin_unlock(&log->l_icloglock);
/*
+ * If this log shutdown also sets the mount shutdown state, issue a
+ * shutdown warning message.
+ */
+ if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+"Filesystem has been shut down due to log error (0x%x).",
+ shutdown_flags);
+ xfs_alert(log->l_mp,
+"Please unmount the filesystem and rectify the problem(s).");
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+ }
+
+ /*
* We don't want anybody waiting for log reservations after this. That
* means we have to wake up everybody queued up on reserveq as well as
* writeq. In addition, we make sure in xlog_{re}grant_log_space that
@@ -3904,8 +3906,12 @@ xlog_force_shutdown(
wake_up_all(&log->l_cilp->xc_start_wait);
wake_up_all(&log->l_cilp->xc_commit_wait);
spin_unlock(&log->l_cilp->xc_push_lock);
+
+ spin_lock(&log->l_icloglock);
xlog_state_shutdown_callbacks(log);
+ spin_unlock(&log->l_icloglock);
+ wake_up_var(&log->l_opstate);
return log_error;
}
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 6c93c8ada6f3..ba57323bfdce 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -103,6 +103,39 @@ xlog_cil_iovec_space(
}
/*
+ * shadow buffers can be large, so we need to use kvmalloc() here to ensure
+ * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall
+ * back to vmalloc, so we can't actually do anything useful with gfp flags to
+ * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do
+ * direct reclaim and compaction in the slow path, both of which are
+ * horrendously expensive. We just want kmalloc to fail fast and fall back to
+ * vmalloc if it can't get something straight away from the free lists or buddy
+ * allocator. Hence we have to open code kvmalloc ourselves here.
+ *
+ * Also, we are in memalloc_nofs_save task context here, so despite the use of
+ * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This
+ * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets
+ * just all pretend this is a GFP_KERNEL context operation....
+ */
+static inline void *
+xlog_cil_kvmalloc(
+ size_t buf_size)
+{
+ gfp_t flags = GFP_KERNEL;
+ void *p;
+
+ flags &= ~__GFP_DIRECT_RECLAIM;
+ flags |= __GFP_NOWARN | __GFP_NORETRY;
+ do {
+ p = kmalloc(buf_size, flags);
+ if (!p)
+ p = vmalloc(buf_size);
+ } while (!p);
+
+ return p;
+}
+
+/*
* Allocate or pin log vector buffers for CIL insertion.
*
* The CIL currently uses disposable buffers for copying a snapshot of the
@@ -203,25 +236,16 @@ xlog_cil_alloc_shadow_bufs(
*/
if (!lip->li_lv_shadow ||
buf_size > lip->li_lv_shadow->lv_size) {
-
/*
* We free and allocate here as a realloc would copy
- * unnecessary data. We don't use kmem_zalloc() for the
+ * unnecessary data. We don't use kvzalloc() for the
* same reason - we don't need to zero the data area in
* the buffer, only the log vector header and the iovec
* storage.
*/
kmem_free(lip->li_lv_shadow);
+ lv = xlog_cil_kvmalloc(buf_size);
- /*
- * We are in transaction context, which means this
- * allocation will pick up GFP_NOFS from the
- * memalloc_nofs_save/restore context the transaction
- * holds. This means we can use GFP_KERNEL here so the
- * generic kvmalloc() code will run vmalloc on
- * contiguous page allocation failure as we require.
- */
- lv = kvmalloc(buf_size, GFP_KERNEL);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
lv->lv_item = lip;
@@ -516,7 +540,7 @@ xlog_cil_insert_items(
spin_unlock(&cil->xc_cil_lock);
if (tp->t_ticket->t_curr_res < 0)
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
static void
@@ -681,11 +705,21 @@ xlog_cil_set_ctx_write_state(
* The LSN we need to pass to the log items on transaction
* commit is the LSN reported by the first log vector write, not
* the commit lsn. If we use the commit record lsn then we can
- * move the tail beyond the grant write head.
+ * move the grant write head beyond the tail LSN and overwrite
+ * it.
*/
ctx->start_lsn = lsn;
wake_up_all(&cil->xc_start_wait);
spin_unlock(&cil->xc_push_lock);
+
+ /*
+ * Make sure the metadata we are about to overwrite in the log
+ * has been flushed to stable storage before this iclog is
+ * issued.
+ */
+ spin_lock(&cil->xc_log->l_icloglock);
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+ spin_unlock(&cil->xc_log->l_icloglock);
return;
}
@@ -830,7 +864,7 @@ xlog_cil_write_commit_record(
error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
if (error)
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
@@ -864,10 +898,7 @@ xlog_cil_push_work(
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL };
- xfs_lsn_t preflush_tail_lsn;
xfs_csn_t push_seq;
- struct bio bio;
- DECLARE_COMPLETION_ONSTACK(bdev_flush);
bool push_commit_stable;
new_ctx = xlog_cil_ctx_alloc();
@@ -938,23 +969,6 @@ xlog_cil_push_work(
spin_unlock(&cil->xc_push_lock);
/*
- * The CIL is stable at this point - nothing new will be added to it
- * because we hold the flush lock exclusively. Hence we can now issue
- * a cache flush to ensure all the completed metadata in the journal we
- * are about to overwrite is on stable storage.
- *
- * Because we are issuing this cache flush before we've written the
- * tail lsn to the iclog, we can have metadata IO completions move the
- * tail forwards between the completion of this flush and the iclog
- * being written. In this case, we need to re-issue the cache flush
- * before the iclog write. To detect whether the log tail moves, sample
- * the tail LSN *before* we issue the flush.
- */
- preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
- xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
- &bdev_flush);
-
- /*
* Pull all the log vectors off the items in the CIL, and remove the
* items from the CIL. We don't need the CIL lock here because it's only
* needed on the transaction commit side which is currently locked out
@@ -1030,12 +1044,6 @@ xlog_cil_push_work(
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
- /*
- * Before we format and submit the first iclog, we have to ensure that
- * the metadata writeback ordering cache flush is complete.
- */
- wait_for_completion(&bdev_flush);
-
error = xlog_cil_write_chain(ctx, &lvhdr);
if (error)
goto out_abort_free_ticket;
@@ -1094,7 +1102,7 @@ xlog_cil_push_work(
if (push_commit_stable &&
ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
- xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn);
+ xlog_state_release_iclog(log, ctx->commit_iclog);
/* Not safe to reference ctx now! */
@@ -1115,7 +1123,7 @@ out_abort_free_ticket:
return;
}
spin_lock(&log->l_icloglock);
- xlog_state_release_iclog(log, ctx->commit_iclog, 0);
+ xlog_state_release_iclog(log, ctx->commit_iclog);
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
}
@@ -1219,18 +1227,27 @@ xlog_cil_push_now(
if (!async)
flush_workqueue(cil->xc_push_wq);
+ spin_lock(&cil->xc_push_lock);
+
+ /*
+ * If this is an async flush request, we always need to set the
+ * xc_push_commit_stable flag even if something else has already queued
+ * a push. The flush caller is asking for the CIL to be on stable
+ * storage when the next push completes, so regardless of who has queued
+ * the push, the flush requires stable semantics from it.
+ */
+ cil->xc_push_commit_stable = async;
+
/*
* If the CIL is empty or we've already pushed the sequence then
- * there's no work we need to do.
+ * there's no more work that we need to do.
*/
- spin_lock(&cil->xc_push_lock);
if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
spin_unlock(&cil->xc_push_lock);
return;
}
cil->xc_push_seq = push_seq;
- cil->xc_push_commit_stable = async;
queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
spin_unlock(&cil->xc_push_lock);
}
@@ -1328,6 +1345,13 @@ xlog_cil_flush(
trace_xfs_log_force(log->l_mp, seq, _RET_IP_);
xlog_cil_push_now(log, seq, true);
+
+ /*
+ * If the CIL is empty, make sure that any previous checkpoint that may
+ * still be in an active iclog is pushed to stable storage.
+ */
+ if (list_empty(&log->l_cilp->xc_cil))
+ xfs_log_force(log->l_mp, 0);
}
/*
@@ -1442,9 +1466,9 @@ out_shutdown:
*/
bool
xfs_log_item_in_current_chkpt(
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip)
{
- struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+ struct xfs_cil *cil = lip->li_log->l_cilp;
if (list_empty(&lip->li_cil))
return false;
@@ -1454,7 +1478,7 @@ xfs_log_item_in_current_chkpt(
* first checkpoint it is written to. Hence if it is different to the
* current sequence, we're in a new checkpoint.
*/
- return lip->li_seq == ctx->sequence;
+ return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
}
/*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 844fbeec3545..401cdc400980 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -484,6 +484,17 @@ xlog_is_shutdown(struct xlog *log)
return test_bit(XLOG_IO_ERROR, &log->l_opstate);
}
+/*
+ * Wait until the xlog_force_shutdown() has marked the log as shut down
+ * so xlog_is_shutdown() will always return true.
+ */
+static inline void
+xlog_shutdown_wait(
+ struct xlog *log)
+{
+ wait_var_event(&log->l_opstate, xlog_is_shutdown(log));
+}
+
/* common routines */
extern int
xlog_recover(
@@ -497,7 +508,7 @@ xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
-extern kmem_zone_t *xfs_log_ticket_zone;
+extern struct kmem_cache *xfs_log_ticket_cache;
struct xlog_ticket *
xlog_ticket_alloc(
struct xlog *log,
@@ -524,8 +535,7 @@ void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
int eventual_size);
-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
- xfs_lsn_t log_tail_lsn);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 10562ecbd9ea..c4ad4296c540 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -27,7 +27,7 @@
#include "xfs_buf_item.h"
#include "xfs_ag.h"
#include "xfs_quota.h"
-
+#include "xfs_reflink.h"
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
@@ -2466,11 +2466,11 @@ xlog_finish_defer_ops(
{
struct xfs_defer_capture *dfc, *next;
struct xfs_trans *tp;
- struct xfs_inode *ip;
int error = 0;
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
struct xfs_trans_res resv;
+ struct xfs_defer_resources dres;
/*
* Create a new transaction reservation from the captured
@@ -2485,7 +2485,7 @@ xlog_finish_defer_ops(
error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
if (error) {
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
@@ -2494,13 +2494,9 @@ xlog_finish_defer_ops(
* from recovering a single intent item.
*/
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_continue(dfc, tp, &ip);
-
+ xfs_defer_ops_continue(dfc, tp, &dres);
error = xfs_trans_commit(tp);
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- }
+ xfs_defer_resources_rele(&dres);
if (error)
return error;
}
@@ -2520,24 +2516,25 @@ xlog_abort_defer_ops(
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_release(mp, dfc);
+ xfs_defer_ops_capture_free(mp, dfc);
}
}
+
/*
* When this is called, all of the log intent items which did not have
- * corresponding log done items should be in the AIL. What we do now
- * is update the data structures associated with each one.
+ * corresponding log done items should be in the AIL. What we do now is update
+ * the data structures associated with each one.
*
- * Since we process the log intent items in normal transactions, they
- * will be removed at some point after the commit. This prevents us
- * from just walking down the list processing each one. We'll use a
- * flag in the intent item to skip those that we've already processed
- * and use the AIL iteration mechanism's generation count to try to
- * speed this up at least a bit.
+ * Since we process the log intent items in normal transactions, they will be
+ * removed at some point after the commit. This prevents us from just walking
+ * down the list processing each one. We'll use a flag in the intent item to
+ * skip those that we've already processed and use the AIL iteration mechanism's
+ * generation count to try to speed this up at least a bit.
*
- * When we start, we know that the intents are the only things in the
- * AIL. As we process them, however, other items are added to the
- * AIL.
+ * When we start, we know that the intents are the only things in the AIL. As we
+ * process them, however, other items are added to the AIL. Hence we know we
+ * have started recovery on all the pending intents when we find a non-intent
+ * item in the AIL.
*/
STATIC int
xlog_recover_process_intents(
@@ -2560,17 +2557,8 @@ xlog_recover_process_intents(
for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
lip != NULL;
lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
- /*
- * We're done when we see something other than an intent.
- * There should be no intents left in the AIL now.
- */
- if (!xlog_item_is_intent(lip)) {
-#ifdef DEBUG
- for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
- ASSERT(!xlog_item_is_intent(lip));
-#endif
+ if (!xlog_item_is_intent(lip))
break;
- }
/*
* We should never see a redo item with a LSN higher than
@@ -2611,8 +2599,9 @@ err:
}
/*
- * A cancel occurs when the mount has failed and we're bailing out.
- * Release all pending log intent items so they don't pin the AIL.
+ * A cancel occurs when the mount has failed and we're bailing out. Release all
+ * pending log intent items that we haven't started recovery on so they don't
+ * pin the AIL.
*/
STATIC void
xlog_recover_cancel_intents(
@@ -2626,17 +2615,8 @@ xlog_recover_cancel_intents(
spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
- /*
- * We're done when we see something other than an intent.
- * There should be no intents left in the AIL now.
- */
- if (!xlog_item_is_intent(lip)) {
-#ifdef DEBUG
- for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
- ASSERT(!xlog_item_is_intent(lip));
-#endif
+ if (!xlog_item_is_intent(lip))
break;
- }
spin_unlock(&ailp->ail_lock);
lip->li_ops->iop_release(lip);
@@ -3474,7 +3454,7 @@ xlog_recover_finish(
*/
xlog_recover_cancel_intents(log);
xfs_alert(log->l_mp, "Failed to recover intents");
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
@@ -3502,6 +3482,28 @@ xlog_recover_finish(
xlog_recover_process_iunlinks(log);
xlog_recover_check_summary(log);
+
+ /*
+ * Recover any CoW staging blocks that are still referenced by the
+ * ondisk refcount metadata. During mount there cannot be any live
+ * staging extents as we have not permitted any user modifications.
+ * Therefore, it is safe to free them all right now, even on a
+ * read-only mount.
+ */
+ error = xfs_reflink_recover_cow(log->l_mp);
+ if (error) {
+ xfs_alert(log->l_mp,
+ "Failed to recover leftover CoW staging extents, err %d.",
+ error);
+ /*
+ * If we get an error here, make sure the log is shut down
+ * but return zero so that any log items committed since the
+ * end of intents processing can be pushed through the CIL
+ * and AIL.
+ */
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
+
return 0;
}
@@ -3532,8 +3534,6 @@ xlog_recover_check_summary(
uint64_t ifree;
int error;
- mp = log->l_mp;
-
freeblks = 0LL;
itotal = 0LL;
ifree = 0LL;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 06dac09eddbd..c5f153c3693f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -21,6 +21,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
@@ -567,6 +568,18 @@ xfs_mount_setup_inode_geom(
xfs_ialloc_setup_geometry(mp);
}
+/* Compute maximum possible height for per-AG btree types for this fs. */
+static inline void
+xfs_agbtree_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ unsigned int levels;
+
+ levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels);
+ levels = max(levels, mp->m_rmap_maxlevels);
+ mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
+}
+
/*
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
@@ -638,6 +651,8 @@ xfs_mountfs(
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
+ xfs_agbtree_compute_maxlevels(mp);
+
/*
* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
* is NOT aligned turn off m_dalign since allocator alignment is within
@@ -922,15 +937,6 @@ xfs_mountfs(
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
- /* Recover any CoW blocks that never got remapped. */
- error = xfs_reflink_recover_cow(mp);
- if (error) {
- xfs_err(mp,
- "Error %d recovering leftover CoW allocations.", error);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- goto out_quota;
- }
-
/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
@@ -941,7 +947,6 @@ xfs_mountfs(
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
- out_quota:
xfs_qm_unmount_quotas(mp);
out_rtunmount:
xfs_rtunmount_inodes(mp);
@@ -1142,7 +1147,7 @@ xfs_mod_fdblocks(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
- set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+ set_aside = xfs_fdblocks_unavailable(mp);
percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e091f3b3fa15..f6dc19de8322 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -128,10 +128,11 @@ typedef struct xfs_mount {
uint m_rmap_mnr[2]; /* min rmap btree records */
uint m_refc_mxr[2]; /* max refc btree records */
uint m_refc_mnr[2]; /* min refc btree records */
- uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
- uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
+ uint m_alloc_maxlevels; /* max alloc btree levels */
+ uint m_bm_maxlevels[2]; /* max bmap btree levels */
uint m_rmap_maxlevels; /* max rmap btree levels */
uint m_refc_maxlevels; /* max refcount btree level */
+ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
uint m_alloc_set_aside; /* space we can't use */
uint m_ag_max_usable; /* max space per AG */
@@ -478,6 +479,21 @@ extern void xfs_unmountfs(xfs_mount_t *);
*/
#define XFS_FDBLOCKS_BATCH 1024
+/*
+ * Estimate the amount of free space that is not available to userspace and is
+ * not explicitly reserved from the incore fdblocks. This includes:
+ *
+ * - The minimum number of blocks needed to support splitting a bmap btree
+ * - The blocks currently in use by the freespace btrees because they record
+ * the actual blocks that will fill per-AG metadata space reservations
+ */
+static inline uint64_t
+xfs_fdblocks_unavailable(
+ struct xfs_mount *mp)
+{
+ return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+}
+
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 34c3b16f834f..f85e3b07ab44 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -219,7 +219,7 @@ _xfs_mru_cache_list_insert(
* When destroying or reaping, all the elements that were migrated to the reap
* list need to be deleted. For each element this involves removing it from the
* data store, removing it from the reap list, calling the client's free
- * function and deleting the element from the element zone.
+ * function and deleting the element from the element cache.
*
* We get called holding the mru->lock, which we drop and then reacquire.
* Sparse need special help with this to tell it we know what we are doing.
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 5e1d29d8b2e7..37a24f0f7cd4 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -71,6 +71,40 @@ xfs_fs_get_uuid(
}
/*
+ * We cannot use file based VFS helpers such as file_modified() to update
+ * inode state as we modify the data/metadata in the inode here. Hence we have
+ * to open code the timestamp updates and SUID/SGID stripping. We also need
+ * to set the inode prealloc flag to ensure that the extents we allocate are not
+ * removed if the inode is reclaimed from memory before xfs_fs_block_commit()
+ * is called from the client to indicate that data has been written and the
+ * file size can be extended.
+ *
+ * NOTE(review): no xfs_fs_block_commit() is visible in this file; the name
+ * above presumably refers to xfs_fs_commit_blocks() - confirm.
+ *
+ * All changes are made in one tr_writeid transaction with the inode locked
+ * XFS_ILOCK_EXCL and joined to the transaction: S_ISUID is cleared
+ * unconditionally, S_ISGID is cleared only when S_IXGRP is set, the
+ * XFS_ICHGTIME_MOD and XFS_ICHGTIME_CHG timestamps are updated,
+ * XFS_DIFLAG_PREALLOC is set and the inode core is logged.
+ *
+ * Returns the error from xfs_trans_alloc() if the transaction cannot be
+ * allocated, otherwise the result of xfs_trans_commit().
+ */
+static int
+xfs_fs_map_update_inode(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+ 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ VFS_I(ip)->i_mode &= ~S_ISUID;
+ if (VFS_I(ip)->i_mode & S_IXGRP)
+ VFS_I(ip)->i_mode &= ~S_ISGID;
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return xfs_trans_commit(tp);
+}
+
+/*
* Get a layout for the pNFS client.
*/
int
@@ -155,7 +189,7 @@ xfs_fs_map_blocks(
xfs_iunlock(ip, lock_flags);
error = xfs_iomap_write_direct(ip, offset_fsb,
- end_fsb - offset_fsb, &imap);
+ end_fsb - offset_fsb, 0, &imap);
if (error)
goto out_unlock;
@@ -164,16 +198,18 @@ xfs_fs_map_blocks(
* that the blocks allocated and handed out to the client are
* guaranteed to be present even after a server crash.
*/
- error = xfs_update_prealloc_flags(ip,
- XFS_PREALLOC_SET | XFS_PREALLOC_SYNC);
+ error = xfs_fs_map_update_inode(ip);
+ if (!error)
+ error = xfs_log_force_inode(ip);
if (error)
goto out_unlock;
+
} else {
xfs_iunlock(ip, lock_flags);
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
*device_generation = mp->m_generation;
return error;
out_unlock:
@@ -255,7 +291,7 @@ xfs_fs_commit_blocks(
length = end - start;
if (!length)
continue;
-
+
/*
* Make sure reads through the pagecache see the new data.
*/
@@ -283,7 +319,8 @@ xfs_fs_commit_blocks(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_setattr_time(ip, iattr);
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(&init_user_ns, inode, iattr);
if (update_isize) {
i_size_write(inode, iattr->ia_size);
ip->i_disk_size = iattr->ia_size;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5608066d6e53..f165d1a3de1d 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -25,6 +25,7 @@
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_ialloc.h"
+#include "xfs_log_priv.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -121,8 +122,7 @@ xfs_qm_dqpurge(
struct xfs_dquot *dqp,
void *data)
{
- struct xfs_mount *mp = dqp->q_mount;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
int error = -EAGAIN;
xfs_dqlock(dqp);
@@ -157,7 +157,7 @@ xfs_qm_dqpurge(
}
ASSERT(atomic_read(&dqp->q_pincount) == 0);
- ASSERT(xfs_is_shutdown(mp) ||
+ ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) ||
!test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
xfs_dqfunlock(dqp);
@@ -172,7 +172,7 @@ xfs_qm_dqpurge(
*/
ASSERT(!list_empty(&dqp->q_lru));
list_lru_del(&qi->qi_lru, &dqp->q_lru);
- XFS_STATS_DEC(mp, xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
return 0;
@@ -850,7 +850,7 @@ xfs_qm_reset_dqcounts(
*/
#ifdef DEBUG
j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) /
- sizeof(xfs_dqblk_t);
+ sizeof(struct xfs_dqblk);
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
dqb = bp->b_addr;
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 442a0f97a9d4..5bb12717ea28 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -11,7 +11,7 @@
struct xfs_inode;
-extern struct kmem_zone *xfs_qm_dqtrxzone;
+extern struct kmem_cache *xfs_dqtrx_cache;
/*
* Number of bmaps that we ask from bmapi when doing a quotacheck.
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 47fe60e1a887..7d5a31827681 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -303,13 +303,6 @@ xfs_qm_scall_setqlim(
return 0;
/*
- * We don't want to race with a quotaoff so take the quotaoff lock.
- * We don't hold an inode lock, so there's nothing else to stop
- * a quotaoff from happening.
- */
- mutex_lock(&q->qi_quotaofflock);
-
- /*
* Get the dquot (locked) before we start, as we need to do a
* transaction to allocate it if it doesn't exist. Once we have the
* dquot, unlock it so we can start the next transaction safely. We hold
@@ -319,7 +312,7 @@ xfs_qm_scall_setqlim(
error = xfs_qm_dqget(mp, id, type, true, &dqp);
if (error) {
ASSERT(error != -ENOENT);
- goto out_unlock;
+ return error;
}
defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
@@ -415,8 +408,6 @@ xfs_qm_scall_setqlim(
out_rele:
xfs_qm_dqrele(dqp);
-out_unlock:
- mutex_unlock(&q->qi_quotaofflock);
return error;
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 46904b793bd4..0d868c93144d 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -21,8 +21,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_cui_zone;
-kmem_zone_t *xfs_cud_zone;
+struct kmem_cache *xfs_cui_cache;
+struct kmem_cache *xfs_cud_cache;
static const struct xfs_item_ops xfs_cui_item_ops;
@@ -38,7 +38,7 @@ xfs_cui_item_free(
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
- kmem_cache_free(xfs_cui_zone, cuip);
+ kmem_cache_free(xfs_cui_cache, cuip);
}
/*
@@ -143,7 +143,7 @@ xfs_cui_init(
cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
0);
else
- cuip = kmem_cache_zalloc(xfs_cui_zone,
+ cuip = kmem_cache_zalloc(xfs_cui_cache,
GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
@@ -204,7 +204,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
- kmem_cache_free(xfs_cud_zone, cudp);
+ kmem_cache_free(xfs_cud_cache, cudp);
}
static const struct xfs_item_ops xfs_cud_item_ops = {
@@ -221,7 +221,7 @@ xfs_trans_get_cud(
{
struct xfs_cud_log_item *cudp;
- cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL);
+ cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
&xfs_cud_item_ops);
cudp->cud_cuip = cuip;
@@ -384,7 +384,7 @@ xfs_refcount_update_finish_item(
refc->ri_blockcount = new_aglen;
return -EAGAIN;
}
- kmem_free(refc);
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
return error;
}
@@ -404,7 +404,7 @@ xfs_refcount_update_cancel_item(
struct xfs_refcount_intent *refc;
refc = container_of(item, struct xfs_refcount_intent, ri_list);
- kmem_free(refc);
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
}
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
@@ -457,7 +457,7 @@ xfs_cui_item_recover(
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
unsigned int refc_type;
@@ -557,7 +557,7 @@ xfs_cui_item_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index f4f2e836540b..eb0ab13682d0 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -25,7 +25,7 @@
/* kernel only CUI/CUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -68,7 +68,7 @@ struct xfs_cud_log_item {
struct xfs_cud_log_format cud_format;
};
-extern struct kmem_zone *xfs_cui_zone;
-extern struct kmem_zone *xfs_cud_zone;
+extern struct kmem_cache *xfs_cui_cache;
+extern struct kmem_cache *xfs_cud_cache;
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 76355f293488..54e68e5693fd 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -425,7 +425,10 @@ convert:
if (!convert_now || cmap->br_state == XFS_EXT_NORM)
return 0;
trace_xfs_reflink_convert_cow(ip, cmap);
- return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+ return error;
out_trans_cancel:
xfs_trans_cancel(tp);
@@ -484,7 +487,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
del.br_blockcount);
- xfs_bmap_add_free(*tpp, del.br_startblock,
+ xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL);
/* Roll the transaction */
@@ -749,7 +752,10 @@ xfs_reflink_end_cow(
}
/*
- * Free leftover CoW reservations that didn't get cleaned out.
+ * Free all CoW staging blocks that are still referenced by the ondisk refcount
+ * metadata. The ondisk metadata does not track which inode created the
+ * staging extent, so callers must ensure that there are no cached inodes with
+ * live CoW staging extents.
*/
int
xfs_reflink_recover_cow(
@@ -1269,8 +1275,7 @@ xfs_reflink_zero_posteof(
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
- return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
- &xfs_buffered_write_iomap_ops);
+ return xfs_zero_range(ip, isize, pos - isize, NULL);
}
/*
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 5f0695980467..a22b2d19ef91 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -21,8 +21,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_rui_zone;
-kmem_zone_t *xfs_rud_zone;
+struct kmem_cache *xfs_rui_cache;
+struct kmem_cache *xfs_rud_cache;
static const struct xfs_item_ops xfs_rui_item_ops;
@@ -38,7 +38,7 @@ xfs_rui_item_free(
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
- kmem_cache_free(xfs_rui_zone, ruip);
+ kmem_cache_free(xfs_rui_cache, ruip);
}
/*
@@ -141,7 +141,7 @@ xfs_rui_init(
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
else
- ruip = kmem_cache_zalloc(xfs_rui_zone,
+ ruip = kmem_cache_zalloc(xfs_rui_cache,
GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
@@ -227,7 +227,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
- kmem_cache_free(xfs_rud_zone, rudp);
+ kmem_cache_free(xfs_rud_cache, rudp);
}
static const struct xfs_item_ops xfs_rud_item_ops = {
@@ -244,7 +244,7 @@ xfs_trans_get_rud(
{
struct xfs_rud_log_item *rudp;
- rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL);
+ rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
&xfs_rud_item_ops);
rudp->rud_ruip = ruip;
@@ -427,7 +427,7 @@ xfs_rmap_update_finish_item(
rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock,
rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state,
state);
- kmem_free(rmap);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
return error;
}
@@ -447,7 +447,7 @@ xfs_rmap_update_cancel_item(
struct xfs_rmap_intent *rmap;
rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- kmem_free(rmap);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
}
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
@@ -510,7 +510,7 @@ xfs_rui_item_recover(
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
enum xfs_rmap_intent_type type;
xfs_exntst_t state;
int i;
@@ -587,7 +587,7 @@ xfs_rui_item_recover(
}
xfs_rmap_finish_one_cleanup(tp, rcur, error);
- return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_rmap_finish_one_cleanup(tp, rcur, error);
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 31e6cdfff71f..802e5119eaca 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -28,7 +28,7 @@
/* kernel only RUI/RUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -68,7 +68,7 @@ struct xfs_rud_log_item {
struct xfs_rud_log_format rud_format;
};
-extern struct kmem_zone *xfs_rui_zone;
-extern struct kmem_zone *xfs_rud_zone;
+extern struct kmem_cache *xfs_rui_cache;
+extern struct kmem_cache *xfs_rud_cache;
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c4e0cd1c1c8c..54be9d64093e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -37,6 +37,7 @@
#include "xfs_reflink.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"
+#include "xfs_defer.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -330,13 +331,34 @@ xfs_set_inode_alloc(
return xfs_is_inode32(mp) ? maxagi : agcount;
}
-static bool
-xfs_buftarg_is_dax(
- struct super_block *sb,
- struct xfs_buftarg *bt)
+static int
+xfs_setup_dax_always(
+ struct xfs_mount *mp)
{
- return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0,
- bdev_nr_sectors(bt->bt_bdev));
+ if (!mp->m_ddev_targp->bt_daxdev &&
+ (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) {
+ xfs_alert(mp,
+ "DAX unsupported by block device. Turning off DAX.");
+ goto disable_dax;
+ }
+
+ if (mp->m_super->s_blocksize != PAGE_SIZE) {
+ xfs_alert(mp,
+ "DAX not supported for blocksize. Turning off DAX.");
+ goto disable_dax;
+ }
+
+ if (xfs_has_reflink(mp)) {
+ xfs_alert(mp, "DAX and reflink cannot be used together!");
+ return -EINVAL;
+ }
+
+ xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ return 0;
+
+disable_dax:
+ xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
+ return 0;
}
STATIC int
@@ -369,26 +391,19 @@ STATIC void
xfs_close_devices(
struct xfs_mount *mp)
{
- struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev;
-
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
- struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev;
xfs_free_buftarg(mp->m_logdev_targp);
xfs_blkdev_put(logdev);
- fs_put_dax(dax_logdev);
}
if (mp->m_rtdev_targp) {
struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
- struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev;
xfs_free_buftarg(mp->m_rtdev_targp);
xfs_blkdev_put(rtdev);
- fs_put_dax(dax_rtdev);
}
xfs_free_buftarg(mp->m_ddev_targp);
- fs_put_dax(dax_ddev);
}
/*
@@ -406,8 +421,6 @@ xfs_open_devices(
struct xfs_mount *mp)
{
struct block_device *ddev = mp->m_super->s_bdev;
- struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev);
- struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL;
struct block_device *logdev = NULL, *rtdev = NULL;
int error;
@@ -417,8 +430,7 @@ xfs_open_devices(
if (mp->m_logname) {
error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
if (error)
- goto out;
- dax_logdev = fs_dax_get_by_bdev(logdev);
+ return error;
}
if (mp->m_rtname) {
@@ -432,25 +444,24 @@ xfs_open_devices(
error = -EINVAL;
goto out_close_rtdev;
}
- dax_rtdev = fs_dax_get_by_bdev(rtdev);
}
/*
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev);
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
@@ -466,14 +477,9 @@ xfs_open_devices(
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
xfs_blkdev_put(rtdev);
- fs_put_dax(dax_rtdev);
out_close_logdev:
- if (logdev && logdev != ddev) {
+ if (logdev && logdev != ddev)
xfs_blkdev_put(logdev);
- fs_put_dax(dax_logdev);
- }
- out:
- fs_put_dax(dax_ddev);
return error;
}
@@ -729,6 +735,7 @@ xfs_fs_sync_fs(
int wait)
{
struct xfs_mount *mp = XFS_M(sb);
+ int error;
trace_xfs_fs_sync_fs(mp, __return_address);
@@ -738,7 +745,10 @@ xfs_fs_sync_fs(
if (!wait)
return 0;
- xfs_log_force(mp, XFS_LOG_SYNC);
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+
if (laptop_mode) {
/*
* The disk must be active because we're syncing.
@@ -805,7 +815,8 @@ xfs_fs_statfs(
spin_unlock(&mp->m_sb_lock);
/* make sure statp->f_bfree does not underflow */
- statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0);
+ statp->f_bfree = max_t(int64_t, 0,
+ fdblocks - xfs_fdblocks_unavailable(mp));
statp->f_bavail = statp->f_bfree;
fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
@@ -1592,26 +1603,9 @@ xfs_fs_fill_super(
sb->s_flags |= SB_I_VERSION;
if (xfs_has_dax_always(mp)) {
- bool rtdev_is_dax = false, datadev_is_dax;
-
- xfs_warn(mp,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-
- datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp);
- if (mp->m_rtdev_targp)
- rtdev_is_dax = xfs_buftarg_is_dax(sb,
- mp->m_rtdev_targp);
- if (!rtdev_is_dax && !datadev_is_dax) {
- xfs_alert(mp,
- "DAX unsupported by block device. Turning off DAX.");
- xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
- }
- if (xfs_has_reflink(mp)) {
- xfs_alert(mp,
- "DAX and reflink cannot be used together!");
- error = -EINVAL;
+ error = xfs_setup_dax_always(mp);
+ if (error)
goto out_filestream_unmount;
- }
}
if (xfs_has_discard(mp)) {
@@ -1738,15 +1732,6 @@ xfs_remount_rw(
*/
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
-
- /* Recover any CoW blocks that never got remapped. */
- error = xfs_reflink_recover_cow(mp);
- if (error) {
- xfs_err(mp,
- "Error %d recovering leftover CoW allocations.", error);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
xfs_blockgc_start(mp);
/* Create the per-AG metadata reservation pool. */
@@ -1764,7 +1749,15 @@ static int
xfs_remount_ro(
struct xfs_mount *mp)
{
- int error;
+ struct xfs_icwalk icw = {
+ .icw_flags = XFS_ICWALK_FLAG_SYNC,
+ };
+ int error;
+
+ /* Flush all the dirty data to disk. */
+ error = sync_filesystem(mp->m_super);
+ if (error)
+ return error;
/*
* Cancel background eofb scanning so it cannot race with the final
@@ -1772,8 +1765,13 @@ xfs_remount_ro(
*/
xfs_blockgc_stop(mp);
- /* Get rid of any leftover CoW reservations... */
- error = xfs_blockgc_free_space(mp, NULL);
+ /*
+ * Clear out all remaining COW staging extents and speculative post-EOF
+ * preallocations so that we don't leave inodes requiring inactivation
+ * cleanups during reclaim on a read-only mount. We must process every
+ * cached inode, so this requires a synchronous cache scan.
+ */
+ error = xfs_blockgc_free_space(mp, &icw);
if (error) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
@@ -1839,8 +1837,6 @@ xfs_fs_reconfigure(
if (error)
return error;
- sync_filesystem(mp->m_super);
-
/* inode32 -> inode64 */
if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
@@ -1951,196 +1947,194 @@ static struct file_system_type xfs_fs_type = {
MODULE_ALIAS_FS("xfs");
STATIC int __init
-xfs_init_zones(void)
+xfs_init_caches(void)
{
- xfs_log_ticket_zone = kmem_cache_create("xfs_log_ticket",
+ int error;
+
+ xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket",
sizeof(struct xlog_ticket),
0, 0, NULL);
- if (!xfs_log_ticket_zone)
+ if (!xfs_log_ticket_cache)
goto out;
- xfs_bmap_free_item_zone = kmem_cache_create("xfs_bmap_free_item",
- sizeof(struct xfs_extent_free_item),
- 0, 0, NULL);
- if (!xfs_bmap_free_item_zone)
- goto out_destroy_log_ticket_zone;
+ error = xfs_btree_init_cur_caches();
+ if (error)
+ goto out_destroy_log_ticket_cache;
- xfs_btree_cur_zone = kmem_cache_create("xfs_btree_cur",
- sizeof(struct xfs_btree_cur),
- 0, 0, NULL);
- if (!xfs_btree_cur_zone)
- goto out_destroy_bmap_free_item_zone;
+ error = xfs_defer_init_item_caches();
+ if (error)
+ goto out_destroy_btree_cur_cache;
- xfs_da_state_zone = kmem_cache_create("xfs_da_state",
+ xfs_da_state_cache = kmem_cache_create("xfs_da_state",
sizeof(struct xfs_da_state),
0, 0, NULL);
- if (!xfs_da_state_zone)
- goto out_destroy_btree_cur_zone;
+ if (!xfs_da_state_cache)
+ goto out_destroy_defer_item_cache;
- xfs_ifork_zone = kmem_cache_create("xfs_ifork",
+ xfs_ifork_cache = kmem_cache_create("xfs_ifork",
sizeof(struct xfs_ifork),
0, 0, NULL);
- if (!xfs_ifork_zone)
- goto out_destroy_da_state_zone;
+ if (!xfs_ifork_cache)
+ goto out_destroy_da_state_cache;
- xfs_trans_zone = kmem_cache_create("xfs_trans",
+ xfs_trans_cache = kmem_cache_create("xfs_trans",
sizeof(struct xfs_trans),
0, 0, NULL);
- if (!xfs_trans_zone)
- goto out_destroy_ifork_zone;
+ if (!xfs_trans_cache)
+ goto out_destroy_ifork_cache;
/*
- * The size of the zone allocated buf log item is the maximum
+ * The size of the cache-allocated buf log item is the maximum
* size possible under XFS. This wastes a little bit of memory,
* but it is much faster.
*/
- xfs_buf_item_zone = kmem_cache_create("xfs_buf_item",
+ xfs_buf_item_cache = kmem_cache_create("xfs_buf_item",
sizeof(struct xfs_buf_log_item),
0, 0, NULL);
- if (!xfs_buf_item_zone)
- goto out_destroy_trans_zone;
+ if (!xfs_buf_item_cache)
+ goto out_destroy_trans_cache;
- xfs_efd_zone = kmem_cache_create("xfs_efd_item",
+ xfs_efd_cache = kmem_cache_create("xfs_efd_item",
(sizeof(struct xfs_efd_log_item) +
(XFS_EFD_MAX_FAST_EXTENTS - 1) *
sizeof(struct xfs_extent)),
0, 0, NULL);
- if (!xfs_efd_zone)
- goto out_destroy_buf_item_zone;
+ if (!xfs_efd_cache)
+ goto out_destroy_buf_item_cache;
- xfs_efi_zone = kmem_cache_create("xfs_efi_item",
+ xfs_efi_cache = kmem_cache_create("xfs_efi_item",
(sizeof(struct xfs_efi_log_item) +
(XFS_EFI_MAX_FAST_EXTENTS - 1) *
sizeof(struct xfs_extent)),
0, 0, NULL);
- if (!xfs_efi_zone)
- goto out_destroy_efd_zone;
+ if (!xfs_efi_cache)
+ goto out_destroy_efd_cache;
- xfs_inode_zone = kmem_cache_create("xfs_inode",
+ xfs_inode_cache = kmem_cache_create("xfs_inode",
sizeof(struct xfs_inode), 0,
(SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD | SLAB_ACCOUNT),
xfs_fs_inode_init_once);
- if (!xfs_inode_zone)
- goto out_destroy_efi_zone;
+ if (!xfs_inode_cache)
+ goto out_destroy_efi_cache;
- xfs_ili_zone = kmem_cache_create("xfs_ili",
+ xfs_ili_cache = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
NULL);
- if (!xfs_ili_zone)
- goto out_destroy_inode_zone;
+ if (!xfs_ili_cache)
+ goto out_destroy_inode_cache;
- xfs_icreate_zone = kmem_cache_create("xfs_icr",
+ xfs_icreate_cache = kmem_cache_create("xfs_icr",
sizeof(struct xfs_icreate_item),
0, 0, NULL);
- if (!xfs_icreate_zone)
- goto out_destroy_ili_zone;
+ if (!xfs_icreate_cache)
+ goto out_destroy_ili_cache;
- xfs_rud_zone = kmem_cache_create("xfs_rud_item",
+ xfs_rud_cache = kmem_cache_create("xfs_rud_item",
sizeof(struct xfs_rud_log_item),
0, 0, NULL);
- if (!xfs_rud_zone)
- goto out_destroy_icreate_zone;
+ if (!xfs_rud_cache)
+ goto out_destroy_icreate_cache;
- xfs_rui_zone = kmem_cache_create("xfs_rui_item",
+ xfs_rui_cache = kmem_cache_create("xfs_rui_item",
xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_rui_zone)
- goto out_destroy_rud_zone;
+ if (!xfs_rui_cache)
+ goto out_destroy_rud_cache;
- xfs_cud_zone = kmem_cache_create("xfs_cud_item",
+ xfs_cud_cache = kmem_cache_create("xfs_cud_item",
sizeof(struct xfs_cud_log_item),
0, 0, NULL);
- if (!xfs_cud_zone)
- goto out_destroy_rui_zone;
+ if (!xfs_cud_cache)
+ goto out_destroy_rui_cache;
- xfs_cui_zone = kmem_cache_create("xfs_cui_item",
+ xfs_cui_cache = kmem_cache_create("xfs_cui_item",
xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_cui_zone)
- goto out_destroy_cud_zone;
+ if (!xfs_cui_cache)
+ goto out_destroy_cud_cache;
- xfs_bud_zone = kmem_cache_create("xfs_bud_item",
+ xfs_bud_cache = kmem_cache_create("xfs_bud_item",
sizeof(struct xfs_bud_log_item),
0, 0, NULL);
- if (!xfs_bud_zone)
- goto out_destroy_cui_zone;
+ if (!xfs_bud_cache)
+ goto out_destroy_cui_cache;
- xfs_bui_zone = kmem_cache_create("xfs_bui_item",
+ xfs_bui_cache = kmem_cache_create("xfs_bui_item",
xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_bui_zone)
- goto out_destroy_bud_zone;
+ if (!xfs_bui_cache)
+ goto out_destroy_bud_cache;
return 0;
- out_destroy_bud_zone:
- kmem_cache_destroy(xfs_bud_zone);
- out_destroy_cui_zone:
- kmem_cache_destroy(xfs_cui_zone);
- out_destroy_cud_zone:
- kmem_cache_destroy(xfs_cud_zone);
- out_destroy_rui_zone:
- kmem_cache_destroy(xfs_rui_zone);
- out_destroy_rud_zone:
- kmem_cache_destroy(xfs_rud_zone);
- out_destroy_icreate_zone:
- kmem_cache_destroy(xfs_icreate_zone);
- out_destroy_ili_zone:
- kmem_cache_destroy(xfs_ili_zone);
- out_destroy_inode_zone:
- kmem_cache_destroy(xfs_inode_zone);
- out_destroy_efi_zone:
- kmem_cache_destroy(xfs_efi_zone);
- out_destroy_efd_zone:
- kmem_cache_destroy(xfs_efd_zone);
- out_destroy_buf_item_zone:
- kmem_cache_destroy(xfs_buf_item_zone);
- out_destroy_trans_zone:
- kmem_cache_destroy(xfs_trans_zone);
- out_destroy_ifork_zone:
- kmem_cache_destroy(xfs_ifork_zone);
- out_destroy_da_state_zone:
- kmem_cache_destroy(xfs_da_state_zone);
- out_destroy_btree_cur_zone:
- kmem_cache_destroy(xfs_btree_cur_zone);
- out_destroy_bmap_free_item_zone:
- kmem_cache_destroy(xfs_bmap_free_item_zone);
- out_destroy_log_ticket_zone:
- kmem_cache_destroy(xfs_log_ticket_zone);
+ out_destroy_bud_cache:
+ kmem_cache_destroy(xfs_bud_cache);
+ out_destroy_cui_cache:
+ kmem_cache_destroy(xfs_cui_cache);
+ out_destroy_cud_cache:
+ kmem_cache_destroy(xfs_cud_cache);
+ out_destroy_rui_cache:
+ kmem_cache_destroy(xfs_rui_cache);
+ out_destroy_rud_cache:
+ kmem_cache_destroy(xfs_rud_cache);
+ out_destroy_icreate_cache:
+ kmem_cache_destroy(xfs_icreate_cache);
+ out_destroy_ili_cache:
+ kmem_cache_destroy(xfs_ili_cache);
+ out_destroy_inode_cache:
+ kmem_cache_destroy(xfs_inode_cache);
+ out_destroy_efi_cache:
+ kmem_cache_destroy(xfs_efi_cache);
+ out_destroy_efd_cache:
+ kmem_cache_destroy(xfs_efd_cache);
+ out_destroy_buf_item_cache:
+ kmem_cache_destroy(xfs_buf_item_cache);
+ out_destroy_trans_cache:
+ kmem_cache_destroy(xfs_trans_cache);
+ out_destroy_ifork_cache:
+ kmem_cache_destroy(xfs_ifork_cache);
+ out_destroy_da_state_cache:
+ kmem_cache_destroy(xfs_da_state_cache);
+ out_destroy_defer_item_cache:
+ xfs_defer_destroy_item_caches();
+ out_destroy_btree_cur_cache:
+ xfs_btree_destroy_cur_caches();
+ out_destroy_log_ticket_cache:
+ kmem_cache_destroy(xfs_log_ticket_cache);
out:
return -ENOMEM;
}
STATIC void
-xfs_destroy_zones(void)
+xfs_destroy_caches(void)
{
/*
* Make sure all delayed RCU frees are flushed before we
* destroy caches.
*/
rcu_barrier();
- kmem_cache_destroy(xfs_bui_zone);
- kmem_cache_destroy(xfs_bud_zone);
- kmem_cache_destroy(xfs_cui_zone);
- kmem_cache_destroy(xfs_cud_zone);
- kmem_cache_destroy(xfs_rui_zone);
- kmem_cache_destroy(xfs_rud_zone);
- kmem_cache_destroy(xfs_icreate_zone);
- kmem_cache_destroy(xfs_ili_zone);
- kmem_cache_destroy(xfs_inode_zone);
- kmem_cache_destroy(xfs_efi_zone);
- kmem_cache_destroy(xfs_efd_zone);
- kmem_cache_destroy(xfs_buf_item_zone);
- kmem_cache_destroy(xfs_trans_zone);
- kmem_cache_destroy(xfs_ifork_zone);
- kmem_cache_destroy(xfs_da_state_zone);
- kmem_cache_destroy(xfs_btree_cur_zone);
- kmem_cache_destroy(xfs_bmap_free_item_zone);
- kmem_cache_destroy(xfs_log_ticket_zone);
+ kmem_cache_destroy(xfs_bui_cache);
+ kmem_cache_destroy(xfs_bud_cache);
+ kmem_cache_destroy(xfs_cui_cache);
+ kmem_cache_destroy(xfs_cud_cache);
+ kmem_cache_destroy(xfs_rui_cache);
+ kmem_cache_destroy(xfs_rud_cache);
+ kmem_cache_destroy(xfs_icreate_cache);
+ kmem_cache_destroy(xfs_ili_cache);
+ kmem_cache_destroy(xfs_inode_cache);
+ kmem_cache_destroy(xfs_efi_cache);
+ kmem_cache_destroy(xfs_efd_cache);
+ kmem_cache_destroy(xfs_buf_item_cache);
+ kmem_cache_destroy(xfs_trans_cache);
+ kmem_cache_destroy(xfs_ifork_cache);
+ kmem_cache_destroy(xfs_da_state_cache);
+ xfs_defer_destroy_item_caches();
+ xfs_btree_destroy_cur_caches();
+ kmem_cache_destroy(xfs_log_ticket_cache);
}
STATIC int __init
@@ -2233,13 +2227,13 @@ init_xfs_fs(void)
if (error)
goto out;
- error = xfs_init_zones();
+ error = xfs_init_caches();
if (error)
goto out_destroy_hp;
error = xfs_init_workqueues();
if (error)
- goto out_destroy_zones;
+ goto out_destroy_caches;
error = xfs_mru_cache_init();
if (error)
@@ -2314,8 +2308,8 @@ init_xfs_fs(void)
xfs_mru_cache_uninit();
out_destroy_wq:
xfs_destroy_workqueues();
- out_destroy_zones:
- xfs_destroy_zones();
+ out_destroy_caches:
+ xfs_destroy_caches();
out_destroy_hp:
xfs_cpu_hotplug_destroy();
out:
@@ -2338,7 +2332,7 @@ exit_xfs_fs(void)
xfs_buf_terminate();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
- xfs_destroy_zones();
+ xfs_destroy_caches();
xfs_uuid_table_free();
xfs_cpu_hotplug_destroy();
}
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index fc2c6a404647..affbedf78160 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -22,6 +22,7 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_ialloc.h"
+#include "xfs_error.h"
/* ----- Kernel only functions below ----- */
int
@@ -96,17 +97,15 @@ xfs_readlink_bmap_ilocked(
int
xfs_readlink(
- struct xfs_inode *ip,
- char *link)
+ struct xfs_inode *ip,
+ char *link)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fsize_t pathlen;
- int error = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t pathlen;
+ int error = -EFSCORRUPTED;
trace_xfs_readlink(ip);
- ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL);
-
if (xfs_is_shutdown(mp))
return -EIO;
@@ -121,12 +120,22 @@ xfs_readlink(
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
ASSERT(0);
- error = -EFSCORRUPTED;
goto out;
}
-
- error = xfs_readlink_bmap_ilocked(ip, link);
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ /*
+ * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED
+ * if if_data is junk.
+ */
+ if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data))
+ goto out;
+
+ memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1);
+ error = 0;
+ } else {
+ error = xfs_readlink_bmap_ilocked(ip, link);
+ }
out:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -184,8 +193,8 @@ xfs_symlink(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
- mapped_fsgid(mnt_userns), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+ mapped_fsgid(mnt_userns, &init_user_ns), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 18dc5eca6c04..574b80c29fe1 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -67,11 +67,12 @@ static const struct sysfs_ops xfs_sysfs_ops = {
static struct attribute *xfs_mp_attrs[] = {
NULL,
};
+ATTRIBUTE_GROUPS(xfs_mp);
struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_mp_attrs,
+ .default_groups = xfs_mp_groups,
};
#ifdef DEBUG
@@ -105,7 +106,7 @@ bug_on_assert_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bug_on_assert ? 1 : 0);
+ return sysfs_emit(buf, "%d\n", xfs_globals.bug_on_assert);
}
XFS_SYSFS_ATTR_RW(bug_on_assert);
@@ -135,7 +136,7 @@ log_recovery_delay_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
+ return sysfs_emit(buf, "%d\n", xfs_globals.log_recovery_delay);
}
XFS_SYSFS_ATTR_RW(log_recovery_delay);
@@ -165,7 +166,7 @@ mount_delay_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay);
+ return sysfs_emit(buf, "%d\n", xfs_globals.mount_delay);
}
XFS_SYSFS_ATTR_RW(mount_delay);
@@ -188,7 +189,7 @@ always_cow_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+ return sysfs_emit(buf, "%d\n", xfs_globals.always_cow);
}
XFS_SYSFS_ATTR_RW(always_cow);
@@ -224,7 +225,7 @@ pwork_threads_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
+ return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
}
XFS_SYSFS_ATTR_RW(pwork_threads);
#endif /* DEBUG */
@@ -239,11 +240,12 @@ static struct attribute *xfs_dbg_attrs[] = {
#endif
NULL,
};
+ATTRIBUTE_GROUPS(xfs_dbg);
struct kobj_type xfs_dbg_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_dbg_attrs,
+ .default_groups = xfs_dbg_groups,
};
#endif /* DEBUG */
@@ -296,11 +298,12 @@ static struct attribute *xfs_stats_attrs[] = {
ATTR_LIST(stats_clear),
NULL,
};
+ATTRIBUTE_GROUPS(xfs_stats);
struct kobj_type xfs_stats_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_stats_attrs,
+ .default_groups = xfs_stats_groups,
};
/* xlog */
@@ -327,7 +330,7 @@ log_head_lsn_show(
block = log->l_curr_block;
spin_unlock(&log->l_icloglock);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
}
XFS_SYSFS_ATTR_RO(log_head_lsn);
@@ -341,7 +344,7 @@ log_tail_lsn_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
}
XFS_SYSFS_ATTR_RO(log_tail_lsn);
@@ -356,7 +359,7 @@ reserve_grant_head_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(reserve_grant_head);
@@ -370,7 +373,7 @@ write_grant_head_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(write_grant_head);
@@ -381,11 +384,12 @@ static struct attribute *xfs_log_attrs[] = {
ATTR_LIST(write_grant_head),
NULL,
};
+ATTRIBUTE_GROUPS(xfs_log);
struct kobj_type xfs_log_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_log_attrs,
+ .default_groups = xfs_log_groups,
};
/*
@@ -425,7 +429,7 @@ max_retries_show(
else
retries = cfg->max_retries;
- return snprintf(buf, PAGE_SIZE, "%d\n", retries);
+ return sysfs_emit(buf, "%d\n", retries);
}
static ssize_t
@@ -466,7 +470,7 @@ retry_timeout_seconds_show(
else
timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
- return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
+ return sysfs_emit(buf, "%d\n", timeout);
}
static ssize_t
@@ -504,7 +508,7 @@ fail_at_unmount_show(
{
struct xfs_mount *mp = err_to_mp(kobject);
- return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount);
+ return sysfs_emit(buf, "%d\n", mp->m_fail_unmount);
}
static ssize_t
@@ -534,12 +538,12 @@ static struct attribute *xfs_error_attrs[] = {
ATTR_LIST(retry_timeout_seconds),
NULL,
};
-
+ATTRIBUTE_GROUPS(xfs_error);
static struct kobj_type xfs_error_cfg_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_error_attrs,
+ .default_groups = xfs_error_groups,
};
static struct kobj_type xfs_error_ktype = {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 1033a95fbf8e..b141ef78c755 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -933,7 +933,7 @@ DEFINE_IREF_EVENT(xfs_inode_unpin);
DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
DECLARE_EVENT_CLASS(xfs_namespace_class,
- TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name),
TP_ARGS(dp, name),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -956,7 +956,7 @@ DECLARE_EVENT_CLASS(xfs_namespace_class,
#define DEFINE_NAMESPACE_EVENT(name) \
DEFINE_EVENT(xfs_namespace_class, name, \
- TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \
TP_ARGS(dp, name))
DEFINE_NAMESPACE_EVENT(xfs_remove);
DEFINE_NAMESPACE_EVENT(xfs_link);
@@ -1308,7 +1308,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
__field(xfs_lsn_t, lsn)
),
TP_fast_assign(
- __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
__entry->lip = lip;
__entry->type = lip->li_type;
__entry->flags = lip->li_flags;
@@ -1361,7 +1361,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class,
__field(xfs_lsn_t, new_lsn)
),
TP_fast_assign(
- __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
__entry->lip = lip;
__entry->type = lip->li_type;
__entry->flags = lip->li_flags;
@@ -2476,7 +2476,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
__entry->btnum = cur->bc_btnum;
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
),
TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 67dec11e34c7..0ac717aad380 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -25,7 +25,7 @@
#include "xfs_dquot.h"
#include "xfs_icache.h"
-kmem_zone_t *xfs_trans_zone;
+struct kmem_cache *xfs_trans_cache;
#if defined(CONFIG_TRACEPOINTS)
static void
@@ -76,7 +76,7 @@ xfs_trans_free(
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
- kmem_cache_free(xfs_trans_zone, tp);
+ kmem_cache_free(xfs_trans_cache, tp);
}
/*
@@ -95,7 +95,7 @@ xfs_trans_dup(
trace_xfs_trans_dup(tp, _RET_IP_);
- ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
+ ntp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
/*
* Initialize the new transaction structure.
@@ -263,7 +263,7 @@ xfs_trans_alloc(
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
retry:
- tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
+ tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
xfs_trans_set_context(tp);
@@ -477,7 +477,7 @@ STATIC void
xfs_trans_apply_sb_deltas(
xfs_trans_t *tp)
{
- xfs_dsb_t *sbp;
+ struct xfs_dsb *sbp;
struct xfs_buf *bp;
int whole = 0;
@@ -541,14 +541,14 @@ xfs_trans_apply_sb_deltas(
/*
* Log the whole thing, the fields are noncontiguous.
*/
- xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
else
/*
* Since all the modifiable fields are contiguous, we
* can get away with this.
*/
- xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
- offsetof(xfs_dsb_t, sb_frextents) +
+ xfs_trans_log_buf(tp, bp, offsetof(struct xfs_dsb, sb_icount),
+ offsetof(struct xfs_dsb, sb_frextents) +
sizeof(sbp->sb_frextents) - 1);
}
@@ -648,7 +648,7 @@ xfs_trans_add_item(
struct xfs_trans *tp,
struct xfs_log_item *lip)
{
- ASSERT(lip->li_mountp == tp->t_mountp);
+ ASSERT(lip->li_log == tp->t_mountp->m_log);
ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
ASSERT(list_empty(&lip->li_trans));
ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags));
@@ -775,7 +775,7 @@ xfs_trans_committed_bulk(
* object into the AIL as we are in a shutdown situation.
*/
if (aborted) {
- ASSERT(xfs_is_shutdown(ailp->ail_mount));
+ ASSERT(xlog_is_shutdown(ailp->ail_log));
if (lip->li_ops->iop_unpin)
lip->li_ops->iop_unpin(lip, 1);
continue;
@@ -836,6 +836,7 @@ __xfs_trans_commit(
bool regrant)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xlog *log = mp->m_log;
xfs_csn_t commit_seq = 0;
int error = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
@@ -864,7 +865,13 @@ __xfs_trans_commit(
if (!(tp->t_flags & XFS_TRANS_DIRTY))
goto out_unreserve;
- if (xfs_is_shutdown(mp)) {
+ /*
+ * We must check against log shutdown here because we cannot abort log
+ * items and leave them dirty, inconsistent and unpinned in memory while
+ * the log is active. This leaves them open to being written back to
+ * disk, and that will lead to on-disk corruption.
+ */
+ if (xlog_is_shutdown(log)) {
error = -EIO;
goto out_unreserve;
}
@@ -878,7 +885,7 @@ __xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant);
+ xlog_cil_commit(log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -905,10 +912,10 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- if (regrant && !xlog_is_shutdown(mp->m_log))
- xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
+ if (regrant && !xlog_is_shutdown(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
else
- xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
}
xfs_trans_free_items(tp, !!error);
@@ -926,36 +933,56 @@ xfs_trans_commit(
}
/*
- * Unlock all of the transaction's items and free the transaction.
- * The transaction must not have modified any of its items, because
- * there is no way to restore them to their previous state.
+ * Unlock all of the transaction's items and free the transaction. If the
+ * transaction is dirty, we must shut down the filesystem because there is no
+ * way to restore them to their previous state.
*
- * If the transaction has made a log reservation, make sure to release
- * it as well.
+ * If the transaction has made a log reservation, make sure to release it as
+ * well.
+ *
+ * This is a high level function (equivalent to xfs_trans_commit()) and so can
+ * be called after the transaction has effectively been aborted due to the mount
+ * being shut down. However, if the mount has not been shut down and the
+ * transaction is dirty we will shut the mount down and, in doing so, that
+ * guarantees that the log is shut down, too. Hence we don't need to be as
+ * careful with shutdown state and dirty items here as we need to be in
+ * xfs_trans_commit().
*/
void
xfs_trans_cancel(
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xlog *log = mp->m_log;
bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
trace_xfs_trans_cancel(tp, _RET_IP_);
- if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+ /*
+ * It's never valid to cancel a transaction with deferred ops attached,
+ * because the transaction is effectively dirty. Complain about this
+ * loudly before freeing the in-memory defer items.
+ */
+ if (!list_empty(&tp->t_dfops)) {
+ ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops));
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ dirty = true;
xfs_defer_cancel(tp);
+ }
/*
- * See if the caller is relying on us to shut down the
- * filesystem. This happens in paths where we detect
- * corruption and decide to give up.
+ * See if the caller is relying on us to shut down the filesystem. We
+ * only want an error report if there isn't already a shutdown in
+ * progress, so we only need to check against the mount shutdown state
+ * here.
*/
if (dirty && !xfs_is_shutdown(mp)) {
XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
#ifdef DEBUG
- if (!dirty && !xfs_is_shutdown(mp)) {
+ /* Log items need to be consistent until the log is shut down. */
+ if (!dirty && !xlog_is_shutdown(log)) {
struct xfs_log_item *lip;
list_for_each_entry(lip, &tp->t_items, li_trans)
@@ -966,7 +993,7 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
}
@@ -1201,3 +1228,89 @@ out_cancel:
xfs_trans_cancel(tp);
return error;
}
+
+/*
+ * Allocate a transaction, lock and join the directory and child inodes to it,
+ * and reserve quota for a directory update. If there isn't sufficient space,
+ * @dblocks will be set to zero for a reservationless directory update and
+ * @nospace_error will be set to a negative errno describing the space
+ * constraint we hit.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The ILOCKs will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_dir(
+ struct xfs_inode *dp,
+ struct xfs_trans_res *resv,
+ struct xfs_inode *ip,
+ unsigned int *dblocks,
+ struct xfs_trans **tpp,
+ int *nospace_error)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int resblks;
+ bool retried = false;
+ int error;
+
+retry:
+ *nospace_error = 0;
+ resblks = *dblocks;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ if (error == -ENOSPC) {
+ *nospace_error = error;
+ resblks = 0;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ }
+ if (error)
+ return error;
+
+ xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqattach_locked(dp, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ if (resblks == 0)
+ goto done;
+
+ error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ *nospace_error = error;
+ resblks = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_cancel;
+
+done:
+ *tpp = tp;
+ *dblocks = resblks;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 50da47f23a07..0c82673238f4 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -8,6 +8,7 @@
/* kernel only transaction subsystem defines */
+struct xlog;
struct xfs_buf;
struct xfs_buftarg;
struct xfs_efd_log_item;
@@ -31,7 +32,7 @@ struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
struct list_head li_trans; /* transaction list */
xfs_lsn_t li_lsn; /* last on-disk lsn */
- struct xfs_mount *li_mountp; /* ptr to fs mount */
+ struct xlog *li_log;
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
unsigned long li_flags; /* misc flags */
@@ -113,12 +114,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
#define XFS_ITEM_FLUSHING 3
/*
- * Deferred operation item relogging limits.
- */
-#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
-#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
-
-/*
* This is the structure maintained for every active transaction.
*/
typedef struct xfs_trans {
@@ -180,7 +175,7 @@ xfs_trans_get_buf(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
int numblks,
- uint flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
@@ -243,7 +238,7 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
struct xfs_buf *src_bp);
-extern kmem_zone_t *xfs_trans_zone;
+extern struct kmem_cache *xfs_trans_cache;
static inline struct xfs_log_item *
xfs_trans_item_relog(
@@ -265,6 +260,9 @@ int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
struct xfs_trans **tpp);
+int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
+ struct xfs_inode *ip, unsigned int *dblocks,
+ struct xfs_trans **tpp, int *nospace_error);
static inline void
xfs_trans_set_context(
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2a8c8dc54c95..d3a97a028560 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -398,7 +398,7 @@ xfsaild_push_item(
* If log item pinning is enabled, skip the push and track the item as
* pinned. This can help induce head-behind-tail conditions.
*/
- if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN))
+ if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
/*
@@ -418,7 +418,7 @@ static long
xfsaild_push(
struct xfs_ail *ailp)
{
- xfs_mount_t *mp = ailp->ail_mount;
+ struct xfs_mount *mp = ailp->ail_log->l_mp;
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
@@ -443,15 +443,27 @@ xfsaild_push(
ailp->ail_log_flush = 0;
XFS_STATS_INC(mp, xs_push_ail_flush);
- xlog_cil_flush(mp->m_log);
+ xlog_cil_flush(ailp->ail_log);
}
spin_lock(&ailp->ail_lock);
- /* barrier matches the ail_target update in xfs_ail_push() */
- smp_rmb();
- target = ailp->ail_target;
- ailp->ail_target_prev = target;
+ /*
+ * If we have a sync push waiter, we always have to push till the AIL is
+ * empty. Update the target to point to the end of the AIL so that we
+ * capture updates that occur after the sync push waiter has gone to
+ * sleep.
+ */
+ if (waitqueue_active(&ailp->ail_empty)) {
+ lip = xfs_ail_max(ailp);
+ if (lip)
+ target = lip->li_lsn;
+ } else {
+ /* barrier matches the ail_target update in xfs_ail_push() */
+ smp_rmb();
+ target = ailp->ail_target;
+ ailp->ail_target_prev = target;
+ }
/* we're done if the AIL is empty or our push has reached the end */
lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
@@ -620,7 +632,7 @@ xfsaild(
* opportunity to release such buffers from the queue.
*/
ASSERT(list_empty(&ailp->ail_buf_list) ||
- xfs_is_shutdown(ailp->ail_mount));
+ xlog_is_shutdown(ailp->ail_log));
xfs_buf_delwri_cancel(&ailp->ail_buf_list);
break;
}
@@ -683,7 +695,7 @@ xfs_ail_push(
struct xfs_log_item *lip;
lip = xfs_ail_min(ailp);
- if (!lip || xfs_is_shutdown(ailp->ail_mount) ||
+ if (!lip || xlog_is_shutdown(ailp->ail_log) ||
XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0)
return;
@@ -724,7 +736,6 @@ xfs_ail_push_all_sync(
spin_lock(&ailp->ail_lock);
while ((lip = xfs_ail_max(ailp)) != NULL) {
prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE);
- ailp->ail_target = lip->li_lsn;
wake_up_process(ailp->ail_task);
spin_unlock(&ailp->ail_lock);
schedule();
@@ -740,7 +751,7 @@ xfs_ail_update_finish(
struct xfs_ail *ailp,
xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
{
- struct xfs_mount *mp = ailp->ail_mount;
+ struct xlog *log = ailp->ail_log;
/* if the tail lsn hasn't changed, don't do updates or wakeups. */
if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
@@ -748,13 +759,13 @@ xfs_ail_update_finish(
return;
}
- if (!xfs_is_shutdown(mp))
- xlog_assign_tail_lsn_locked(mp);
+ if (!xlog_is_shutdown(log))
+ xlog_assign_tail_lsn_locked(log->l_mp);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
spin_unlock(&ailp->ail_lock);
- xfs_log_space_wake(mp);
+ xfs_log_space_wake(log->l_mp);
}
/*
@@ -862,17 +873,17 @@ xfs_trans_ail_delete(
int shutdown_type)
{
struct xfs_ail *ailp = lip->li_ailp;
- struct xfs_mount *mp = ailp->ail_mount;
+ struct xlog *log = ailp->ail_log;
xfs_lsn_t tail_lsn;
spin_lock(&ailp->ail_lock);
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
- if (shutdown_type && !xfs_is_shutdown(mp)) {
- xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+ if (shutdown_type && !xlog_is_shutdown(log)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE,
"%s: attempting to delete a log item that is not in the AIL",
__func__);
- xfs_force_shutdown(mp, shutdown_type);
+ xlog_force_shutdown(log, shutdown_type);
}
return;
}
@@ -893,7 +904,7 @@ xfs_trans_ail_init(
if (!ailp)
return -ENOMEM;
- ailp->ail_mount = mp;
+ ailp->ail_log = mp->m_log;
INIT_LIST_HEAD(&ailp->ail_head);
INIT_LIST_HEAD(&ailp->ail_cursors);
spin_lock_init(&ailp->ail_lock);
@@ -901,7 +912,7 @@ xfs_trans_ail_init(
init_waitqueue_head(&ailp->ail_empty);
ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
- ailp->ail_mount->m_super->s_id);
+ mp->m_super->s_id);
if (IS_ERR(ailp->ail_task))
goto out_free_ailp;
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 3872ce671411..9ba7e6b9bed3 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -846,7 +846,7 @@ STATIC void
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
{
- tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone,
+ tp->t_dqinfo = kmem_cache_zalloc(xfs_dqtrx_cache,
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -856,6 +856,6 @@ xfs_trans_free_dqinfo(
{
if (!tp->t_dqinfo)
return;
- kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
+ kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo);
tp->t_dqinfo = NULL;
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 3004aeac9110..f0d79a9050ba 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -6,6 +6,7 @@
#ifndef __XFS_TRANS_PRIV_H__
#define __XFS_TRANS_PRIV_H__
+struct xlog;
struct xfs_log_item;
struct xfs_mount;
struct xfs_trans;
@@ -50,7 +51,7 @@ struct xfs_ail_cursor {
* Eventually we need to drive the locking in here as well.
*/
struct xfs_ail {
- struct xfs_mount *ail_mount;
+ struct xlog *ail_log;
struct task_struct *ail_task;
struct list_head ail_head;
xfs_lsn_t ail_target;
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index ddc346a9df9b..e20e7c841489 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -35,6 +35,17 @@ static inline int zonefs_zone_mgmt(struct inode *inode,
lockdep_assert_held(&zi->i_truncate_mutex);
+ /*
+ * With ZNS drives, closing an explicitly open zone that has not been
+ * written will change the zone state to "closed", that is, the zone
+ * will remain active. Since this can then cause failure of explicit
+ * open operation on other zones if the drive active zone resources
+ * are exceeded, make sure that the zone does not remain active by
+ * resetting it.
+ */
+ if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
+ op = REQ_OP_ZONE_RESET;
+
trace_zonefs_zone_mgmt(inode, op);
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
@@ -185,9 +196,9 @@ static const struct address_space_operations zonefs_file_aops = {
.readahead = zonefs_readahead,
.writepage = zonefs_writepage,
.writepages = zonefs_writepages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .invalidate_folio = iomap_invalidate_folio,
.migratepage = iomap_migrate_page,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -692,12 +703,10 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
if (!nr_pages)
return 0;
- bio = bio_alloc(GFP_NOFS, nr_pages);
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, nr_pages,
+ REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
bio->bi_iter.bi_sector = zi->i_zsector;
- bio->bi_write_hint = iocb->ki_hint;
bio->bi_ioprio = iocb->ki_ioprio;
- bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
if (iocb->ki_flags & IOCB_DSYNC)
bio->bi_opf |= REQ_FUA;
@@ -852,7 +861,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = zonefs_file_dio_append(iocb, from);
else
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, 0);
+ &zonefs_write_dio_ops, 0, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
@@ -987,7 +996,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
file_accessed(iocb->ki_filp);
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- &zonefs_read_dio_ops, 0);
+ &zonefs_read_dio_ops, 0, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
@@ -1128,7 +1137,7 @@ static const struct file_operations zonefs_file_operations = {
.write_iter = zonefs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
};
static struct kmem_cache *zonefs_inode_cachep;
@@ -1137,13 +1146,14 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
{
struct zonefs_inode_info *zi;
- zi = kmem_cache_alloc(zonefs_inode_cachep, GFP_KERNEL);
+ zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL);
if (!zi)
return NULL;
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
zi->i_wr_refcnt = 0;
+ zi->i_flags = 0;
return &zi->i_vnode;
}
@@ -1295,12 +1305,13 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
inc_nlink(parent);
}
-static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
- enum zonefs_ztype type)
+static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+ enum zonefs_ztype type)
{
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ int ret = 0;
inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
inode->i_mode = S_IFREG | sbi->s_perm;
@@ -1325,6 +1336,22 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
+
+ /*
+ * For sequential zones, make sure that any open zone is closed first
+ * to ensure that the initial number of open zones is 0, in sync with
+ * the open zone accounting done when the mount option
+ * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
+ */
+ if (type == ZONEFS_ZTYPE_SEQ &&
+ (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+ zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
+ mutex_lock(&zi->i_truncate_mutex);
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+ mutex_unlock(&zi->i_truncate_mutex);
+ }
+
+ return ret;
}
static struct dentry *zonefs_create_inode(struct dentry *parent,
@@ -1334,6 +1361,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
struct inode *dir = d_inode(parent);
struct dentry *dentry;
struct inode *inode;
+ int ret;
dentry = d_alloc_name(parent, name);
if (!dentry)
@@ -1344,10 +1372,16 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
goto dput;
inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
- if (zone)
- zonefs_init_file_inode(inode, zone, type);
- else
+ if (zone) {
+ ret = zonefs_init_file_inode(inode, zone, type);
+ if (ret) {
+ iput(inode);
+ goto dput;
+ }
+ } else {
zonefs_init_dir_inode(dir, inode, type);
+ }
+
d_add(dentry, inode);
dir->i_size++;
@@ -1541,10 +1575,8 @@ static int zonefs_read_super(struct super_block *sb)
if (!page)
return -ENOMEM;
- bio_init(&bio, &bio_vec, 1);
+ bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = 0;
- bio.bi_opf = REQ_OP_READ;
- bio_set_dev(&bio, sb->s_bdev);
bio_add_page(&bio, page, PAGE_SIZE, 0);
ret = submit_bio_wait(&bio);
@@ -1787,5 +1819,6 @@ static void __exit zonefs_exit(void)
MODULE_AUTHOR("Damien Le Moal");
MODULE_DESCRIPTION("Zone file system for zoned block devices");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_FS("zonefs");
module_init(zonefs_init);
module_exit(zonefs_exit);